[llvm] [AMDGPU] Fix buffer addressing mode matching (PR #152584)
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 7 13:01:22 PDT 2025
https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/152584
Starting in gfx1250, voffset and immoffset are zero-extended from 32 bits
to 45 bits before being added together.
>From d6ff70e34c0e0dbc8ddfaf9bb749edadbd35368f Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 7 Aug 2025 12:36:32 -0700
Subject: [PATCH] [AMDGPU] Fix buffer addressing mode matching
Starting in gfx1250, voffset and immoffset are zero-extended from 32 bits
to 45 bits before being added together.
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 8 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 17 +-
.../llvm.amdgcn.raw.buffer.atomic.add.ll | 473 ++++---
.../llvm.amdgcn.raw.buffer.atomic.cmpswap.ll | 1096 ++++++++++-----
.../GlobalISel/llvm.amdgcn.raw.buffer.load.ll | 1219 +++++++++++------
.../llvm.amdgcn.raw.buffer.load.tfe.ll | 697 ++++++----
.../llvm.amdgcn.raw.buffer.store.ll | 1194 ++++++++++------
.../llvm.amdgcn.struct.buffer.atomic.add.ll | 610 ++++++---
...lvm.amdgcn.struct.buffer.atomic.cmpswap.ll | 1168 ++++++++++------
.../llvm.amdgcn.struct.buffer.load.ll | 1062 +++++++++-----
.../llvm.amdgcn.struct.buffer.load.tfe.ll | 757 ++++++----
.../llvm.amdgcn.struct.buffer.store.ll | 652 ++++++---
.../llvm.amdgcn.raw.atomic.buffer.load.ll | 705 +++++++---
.../llvm.amdgcn.raw.buffer.atomic.fadd.ll | 99 +-
.../AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll | 1 +
.../AMDGPU/llvm.amdgcn.raw.buffer.store.ll | 1 +
.../llvm.amdgcn.raw.ptr.atomic.buffer.load.ll | 705 +++++++---
...mdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll | 218 ++-
...amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll | 128 +-
...m.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll | 143 +-
.../llvm.amdgcn.raw.ptr.buffer.load.bf16.ll | 35 +-
.../llvm.amdgcn.raw.ptr.buffer.store.bf16.ll | 23 +-
.../llvm.amdgcn.struct.atomic.buffer.load.ll | 868 ++++++++----
.../llvm.amdgcn.struct.buffer.load.tfe.ll | 1 +
.../AMDGPU/llvm.amdgcn.struct.buffer.store.ll | 110 ++
...vm.amdgcn.struct.ptr.atomic.buffer.load.ll | 868 ++++++++----
...gcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 104 ++
...mdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 110 ++
...mdgcn.struct.ptr.buffer.atomic.fmax.f32.ll | 420 ++++--
...mdgcn.struct.ptr.buffer.atomic.fmin.f32.ll | 420 ++++--
30 files changed, 9513 insertions(+), 4399 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index a6e4a63de4c62..40d960e9b3a85 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5879,8 +5879,12 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
const LLT S32 = LLT::scalar(32);
MachineRegisterInfo &MRI = *B.getMRI();
- std::tie(BaseReg, ImmOffset) =
- AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
+ // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
+ // being added, so we can only safely match a 32-bit addition with no unsigned
+ // overflow.
+ bool CheckNUW = AMDGPU::isGFX1250(ST);
+ std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
+ MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
// If BaseReg is a pointer, convert it to int.
if (MRI.getType(BaseReg).isPointer())
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8f44c03d95b43..7fe1a2207058c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10877,6 +10877,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
}
+// Return whether the operation has NoUnsignedWrap property.
+static bool isNoUnsignedWrap(SDValue Addr) {
+ return (Addr.getOpcode() == ISD::ADD &&
+ Addr->getFlags().hasNoUnsignedWrap()) ||
+ Addr->getOpcode() == ISD::OR;
+}
+
bool SITargetLowering::shouldPreservePtrArith(const Function &F,
EVT PtrVT) const {
return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
@@ -10898,8 +10905,14 @@ SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
if ((C1 = dyn_cast<ConstantSDNode>(N0)))
N0 = SDValue();
else if (DAG.isBaseWithConstantOffset(N0)) {
- C1 = cast<ConstantSDNode>(N0.getOperand(1));
- N0 = N0.getOperand(0);
+ // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
+ // being added, so we can only safely match a 32-bit addition with no
+ // unsigned overflow.
+ bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
+ if (!CheckNUW || isNoUnsignedWrap(N0)) {
+ C1 = cast<ConstantSDNode>(N0.getOperand(1));
+ N0 = N0.getOperand(0);
+ }
}
if (C1) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll
index 62f8f8959eba9..79a92918bfe85 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=GFX12,GFX1200 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=GFX12,GFX1250 %s
; Natural mapping
define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
@@ -99,26 +100,47 @@ define amdgpu_ps <2 x float> @raw_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vg
; GFX8-NEXT: $vgpr1 = COPY [[COPY9]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
- ; GFX12-LABEL: name: raw_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN]].sub1
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY8]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY9]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ; GFX1200-LABEL: name: raw_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN]].sub1
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY8]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY9]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN]].sub1
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY8]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY9]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to <2 x float>
ret <2 x float> %cast
@@ -142,22 +164,39 @@ define amdgpu_ps void @raw_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgp
; GFX8-NEXT: BUFFER_ATOMIC_ADD_X2_OFFEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -217,58 +256,111 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
ret float %cast
@@ -328,57 +420,109 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUFFER_ATOMIC_ADD_VBUFFER_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: BUFFER_ATOMIC_ADD_VBUFFER_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUFFER_ATOMIC_ADD_VBUFFER_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -400,21 +544,40 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_vof
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%voffset = add i32 %voffset.base, 4095
%ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll
index 364ed62d9838c..9f1b7a670a40e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck --check-prefix=GFX1200 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck --check-prefix=GFX1250 %s
; Natural mapping
@@ -24,24 +25,43 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_
; GFX8-NEXT: $vgpr0 = COPY [[COPY8]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY8]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY8]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY8]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
ret float %cast
@@ -66,22 +86,39 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__vgpr_val__vgpr_cmp__
; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i32_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i32_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i32_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -145,62 +182,119 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_
; GFX8-NEXT: $vgpr0 = COPY [[COPY15]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY15]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY15]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
ret float %cast
@@ -263,60 +357,115 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -341,24 +490,46 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_
; GFX8-NEXT: $vgpr0 = COPY [[COPY8]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY8]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY8]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE1]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%voffset = add i32 %voffset.base, 4095
%ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
@@ -395,33 +566,61 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr
; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
- ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
- ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
+ ; GFX1200-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX1200-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128_align2 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX1250-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
ret double %cast
@@ -450,26 +649,47 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__
; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_OFFEN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -542,71 +762,137 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr
; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; GFX12-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
- ; GFX12-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0
- ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec
- ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec
- ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0
+ ; GFX1200-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec
+ ; GFX1200-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec
+ ; GFX1200-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128_align2 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0
+ ; GFX1250-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec
+ ; GFX1250-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
ret double %cast
@@ -673,64 +959,123 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; GFX12-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
- ; GFX12-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
- ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -764,33 +1109,64 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr
; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
- ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
- ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
+ ; GFX1200-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX1200-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY8]], [[COPY10]], 0, implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128_align2 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE3]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE2]], [[COPY9]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX1250-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%voffset = add i32 %voffset.base, 4095
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
index 46ca43b7af023..7003bb1a09eae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1200 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1250 %s
; FIXME: Test with SI when argument lowering not broken for f16
; Natural mapping
@@ -124,52 +125,99 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret float %val
}
@@ -226,55 +274,105 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret float %val
}
@@ -509,23 +607,41 @@ define amdgpu_ps <2 x float> @raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sg
; GFX8-NEXT: $vgpr1 = COPY [[COPY7]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
- ; GFX12-LABEL: name: raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub0
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub1
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY6]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY7]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ; GFX1200-LABEL: name: raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub0
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub1
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY6]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY7]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub0
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub1
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY6]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY7]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%val = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret <2 x float> %val
}
@@ -551,25 +667,45 @@ define amdgpu_ps <3 x float> @raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sg
; GFX8-NEXT: $vgpr2 = COPY [[COPY8]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
- ; GFX12-LABEL: name: raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub0
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub1
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub2
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY6]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY7]]
- ; GFX12-NEXT: $vgpr2 = COPY [[COPY8]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ; GFX1200-LABEL: name: raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub0
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub1
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub2
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY6]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY7]]
+ ; GFX1200-NEXT: $vgpr2 = COPY [[COPY8]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN:%[0-9]+]]:vreg_96_align2 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub0
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub1
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub2
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY6]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY7]]
+ ; GFX1250-NEXT: $vgpr2 = COPY [[COPY8]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%val = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret <3 x float> %val
}
@@ -597,27 +733,49 @@ define amdgpu_ps <4 x float> @raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sg
; GFX8-NEXT: $vgpr3 = COPY [[COPY9]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
- ; GFX12-LABEL: name: raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub0
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub1
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub2
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub3
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY6]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY7]]
- ; GFX12-NEXT: $vgpr2 = COPY [[COPY8]]
- ; GFX12-NEXT: $vgpr3 = COPY [[COPY9]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ; GFX1200-LABEL: name: raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub0
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub1
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub2
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub3
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY6]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY7]]
+ ; GFX1200-NEXT: $vgpr2 = COPY [[COPY8]]
+ ; GFX1200-NEXT: $vgpr3 = COPY [[COPY9]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub0
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub1
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub2
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub3
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY6]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY7]]
+ ; GFX1250-NEXT: $vgpr2 = COPY [[COPY8]]
+ ; GFX1250-NEXT: $vgpr3 = COPY [[COPY9]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%val = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret <4 x float> %val
}
@@ -715,23 +873,41 @@ define amdgpu_ps <4 x half> @raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgp
; GFX8-NEXT: $vgpr1 = COPY [[COPY7]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
- ; GFX12-LABEL: name: raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub0
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub1
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY6]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY7]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ; GFX1200-LABEL: name: raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub0
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub1
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY6]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY7]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub0
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub1
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY6]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY7]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%val = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret <4 x half> %val
}
@@ -929,52 +1105,99 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret half %val
}
@@ -1028,52 +1251,99 @@ define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%zext = zext i8 %val to i32
%cast = bitcast i32 %zext to float
@@ -1194,20 +1464,38 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add16
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add16
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add16
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%voffset = add i32 %voffset.base, 16
%val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret float %val
@@ -1229,20 +1517,38 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%voffset = add i32 %voffset.base, 4095
%val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret float %val
@@ -1267,20 +1573,38 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4096
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4096, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4096
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4096, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4096
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%voffset = add i32 %voffset.base, 4096
%val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret float %val
@@ -1522,54 +1846,103 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add5000
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
- ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add5000
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
+ ; GFX1200-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add5000
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
+ ; GFX1250-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%soffset = add i32 %soffset.base, 5000
%val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret float %val
@@ -1627,52 +2000,102 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add5000
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 5000, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add5000
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 5000, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add5000
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; GFX1250-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[V_ADD_U32_e64_]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%voffset = add i32 %voffset.base, 5000
%val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret float %val
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll
index 3fbfb630ce08b..4784ac5de17b0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll
@@ -5,7 +5,8 @@
; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefix=GFX910
; RUN: llc -global-isel -mcpu=gfx1010 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefix=GFX910
; RUN: llc -global-isel -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX11
-; RUN: llc -global-isel -mcpu=gfx1200 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX12
+; RUN: llc -global-isel -mcpu=gfx1200 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX1200
+; RUN: llc -global-isel -mcpu=gfx1250 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX1250
define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) {
; GFX67-LABEL: name: raw_buffer_load_i8_tfe
@@ -110,27 +111,49 @@ define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_i8_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s8) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_i8_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s8) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_i8_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s8) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { i8, i32 } @llvm.amdgcn.raw.buffer.load.sl_i8i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { i8, i32 } %res, 0
store i8 %data, ptr addrspace(1) %data_addr
@@ -242,27 +265,49 @@ define amdgpu_ps void @raw_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_i16_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_i16_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_i16_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { i16, i32 } @llvm.amdgcn.raw.buffer.load.sl_i16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { i16, i32 } %res, 0
store i16 %data, ptr addrspace(1) %data_addr
@@ -374,27 +419,49 @@ define amdgpu_ps void @raw_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_f16_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_f16_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f16_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { half, i32 } @llvm.amdgcn.raw.buffer.load.sl_f16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { half, i32 } %res, 0
store half %data, ptr addrspace(1) %data_addr
@@ -506,27 +573,49 @@ define amdgpu_ps void @raw_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_i32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s32) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_i32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s32) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_i32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s32) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { i32, i32 } @llvm.amdgcn.raw.buffer.load.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { i32, i32 } %res, 0
store i32 %data, ptr addrspace(1) %data_addr
@@ -646,29 +735,53 @@ define amdgpu_ps void @raw_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v2i32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub2
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v2i32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v2i32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_96_align2 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <2 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { <2 x i32>, i32 } %res, 0
store <2 x i32> %data, ptr addrspace(1) %data_addr
@@ -788,29 +901,53 @@ define amdgpu_ps void @raw_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v2f32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub2
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v2f32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v2f32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_96_align2 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <2 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { <2 x float>, i32 } %res, 0
store <2 x float> %data, ptr addrspace(1) %data_addr
@@ -977,30 +1114,55 @@ define amdgpu_ps void @raw_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v3i32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub2
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub3
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v3i32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub3
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v3i32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub3
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <3 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { <3 x i32>, i32 } %res, 0
store <3 x i32> %data, ptr addrspace(1) %data_addr
@@ -1167,30 +1329,55 @@ define amdgpu_ps void @raw_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v3f32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub2
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub3
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v3f32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub3
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v3f32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub3
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <3 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { <3 x float>, i32 } %res, 0
store <3 x float> %data, ptr addrspace(1) %data_addr
@@ -1318,31 +1505,57 @@ define amdgpu_ps void @raw_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v4i32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_160 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub2
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub3
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub4
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY11]], %subreg.sub3
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v4i32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_160 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub3
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub4
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY11]], %subreg.sub3
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v4i32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_160_align2 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub3
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub4
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY11]], %subreg.sub3
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <4 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { <4 x i32>, i32 } %res, 0
store <4 x i32> %data, ptr addrspace(1) %data_addr
@@ -1470,31 +1683,57 @@ define amdgpu_ps void @raw_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v4f32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_160 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub2
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub3
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub4
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY11]], %subreg.sub3
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v4f32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_160 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub3
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub4
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY11]], %subreg.sub3
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v4f32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_160_align2 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub3
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub4
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY11]], %subreg.sub3
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <4 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { <4 x float>, i32 } %res, 0
store <4 x float> %data, ptr addrspace(1) %data_addr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
index 63ca7be08d060..c365d5711f6ce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1200 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1250 %s
; FIXME: Test with SI when argument lowering not broken for f16
; Natural mapping
@@ -126,52 +127,99 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -309,55 +357,105 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -618,22 +716,39 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -657,23 +772,41 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v3f32
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v3f32
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v3f32
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -698,24 +831,43 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -876,22 +1028,39 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -946,54 +1115,103 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -1080,20 +1298,38 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_16
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_16
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_16
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%voffset.add = add i32 %voffset, 16
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -1115,20 +1351,38 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%voffset.add = add i32 %voffset, 4095
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -1153,20 +1407,38 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4096
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4096, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4096
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4096, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4096
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%voffset.add = add i32 %voffset, 4096
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -1256,20 +1528,38 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_16
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_16
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_16
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%voffset.add = add i32 %voffset, 16
call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -1291,20 +1581,38 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%voffset.add = add i32 %voffset, 4095
call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -1329,20 +1637,38 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4096
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4096, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4096
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4096, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4096
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%voffset.add = add i32 %voffset, 4096
call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -1400,52 +1726,102 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_offset_add_5000
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 5000, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_offset_add_5000
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 5000, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_offset_add_5000
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
%voffset.add = add i32 %voffset, 5000
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -1501,51 +1877,97 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffset_offset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 5000, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffset_offset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 5000, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffset_offset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 5000, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 5000, i32 %soffset, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll
index 75d6c59560884..484639a23d284 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1250 %s
; Natural mapping
define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
@@ -22,23 +23,41 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
ret float %cast
@@ -63,23 +82,41 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc_
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
ret float %cast
@@ -109,28 +146,51 @@ define amdgpu_ps <2 x float> @struct_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc_
; GFX8-NEXT: $vgpr1 = COPY [[COPY10]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
- ; GFX12-LABEL: name: struct_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN]].sub1
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ; GFX1200-LABEL: name: struct_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN]].sub1
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY10]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN]].sub1
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY10]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to <2 x float>
ret <2 x float> %cast
@@ -156,24 +216,43 @@ define amdgpu_ps void @struct_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__
; GFX8-NEXT: BUFFER_ATOMIC_ADD_X2_BOTHEN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -236,61 +315,117 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
ret float %cast
@@ -353,60 +488,115 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -431,23 +621,41 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
%cast = bitcast i32 %ret to float
ret float %cast
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll
index c9d1227cf27ab..7dab257aaaaf0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1250 %s
; Natural mapping
define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, i32 %cmp, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
@@ -25,26 +26,47 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sg
; GFX8-NEXT: $vgpr0 = COPY [[COPY9]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
ret float %cast
@@ -71,24 +93,43 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_noret_i32__vgpr_val__vgpr_cm
; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_noret_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_noret_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_noret_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -155,65 +196,125 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg
; GFX8-NEXT: $vgpr0 = COPY [[COPY17]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY17]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY17]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY17]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
ret float %cast
@@ -279,63 +380,121 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -362,26 +521,50 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sg
; GFX8-NEXT: $vgpr0 = COPY [[COPY9]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY9]], 0, implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[V_ADD_U32_e64_]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY10]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%voffset = add i32 %voffset.base, 4095
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
@@ -420,35 +603,65 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__s
; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
- ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
- ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX1200-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX1200-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128_align2 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX1250-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
ret double %cast
@@ -479,28 +692,51 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_noret_i64__vgpr_val__vgpr_cm
; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_noret_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_noret_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_noret_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -576,74 +812,143 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v
; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; GFX12-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
- ; GFX12-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0
- ; GFX12-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec
- ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec
- ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY19:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0
+ ; GFX1200-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec
+ ; GFX1200-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec
+ ; GFX1200-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128_align2 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY19:%[0-9]+]]:vreg_64_align2 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0
+ ; GFX1250-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec
+ ; GFX1250-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
ret double %cast
@@ -713,67 +1018,129 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cm
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; GFX12-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
- ; GFX12-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
- ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -809,35 +1176,68 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__s
; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
- ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
- ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX1200-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX1200-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY9]], [[COPY11]], 0, implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[V_ADD_U32_e64_]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128_align2 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub1
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec
+ ; GFX1250-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%voffset = add i32 %voffset.base, 4095
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
index 9b5e46b382d7e..dbef90f6c9ff9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1250 %s
; Natural mapping
define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
@@ -21,22 +22,39 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %val
}
@@ -63,25 +81,45 @@ define amdgpu_ps <2 x float> @struct_buffer_load_v2f32__sgpr_rsrc__vgpr_vindex__
; GFX8-NEXT: $vgpr1 = COPY [[COPY8]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
- ; GFX12-LABEL: name: struct_buffer_load_v2f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub1
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY7]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY8]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ; GFX1200-LABEL: name: struct_buffer_load_v2f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub1
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY7]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_v2f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub1
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY7]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%val = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x float> %val
}
@@ -110,27 +148,49 @@ define amdgpu_ps <3 x float> @struct_buffer_load_v3f32__sgpr_rsrc__vgpr_vindex__
; GFX8-NEXT: $vgpr2 = COPY [[COPY9]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
- ; GFX12-LABEL: name: struct_buffer_load_v3f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub2
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY7]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY8]]
- ; GFX12-NEXT: $vgpr2 = COPY [[COPY9]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ; GFX1200-LABEL: name: struct_buffer_load_v3f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub2
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY7]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; GFX1200-NEXT: $vgpr2 = COPY [[COPY9]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_v3f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN:%[0-9]+]]:vreg_96_align2 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub2
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY7]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; GFX1250-NEXT: $vgpr2 = COPY [[COPY9]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%val = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <3 x float> %val
}
@@ -161,29 +221,53 @@ define amdgpu_ps <4 x float> @struct_buffer_load_v4f32__sgpr_rsrc__vgpr_vindex__
; GFX8-NEXT: $vgpr3 = COPY [[COPY10]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
- ; GFX12-LABEL: name: struct_buffer_load_v4f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub2
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub3
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY7]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY8]]
- ; GFX12-NEXT: $vgpr2 = COPY [[COPY9]]
- ; GFX12-NEXT: $vgpr3 = COPY [[COPY10]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ; GFX1200-LABEL: name: struct_buffer_load_v4f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub2
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub3
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY7]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; GFX1200-NEXT: $vgpr2 = COPY [[COPY9]]
+ ; GFX1200-NEXT: $vgpr3 = COPY [[COPY10]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_v4f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub2
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub3
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY7]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; GFX1250-NEXT: $vgpr2 = COPY [[COPY9]]
+ ; GFX1250-NEXT: $vgpr3 = COPY [[COPY10]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%val = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <4 x float> %val
}
@@ -208,23 +292,41 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 0)
ret float %val
}
@@ -248,22 +350,42 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_ADD_U32_e64_]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%voffset = add i32 %voffset.base, 4095
%val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %val
@@ -287,22 +409,39 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_soffset_64
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 64
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_soffset_64
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 64
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_soffset_64
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 64
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 64, i32 0)
ret float %val
}
@@ -363,59 +502,113 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %val
}
@@ -438,22 +631,39 @@ define amdgpu_ps float @struct_buffer_load_i8_zext__sgpr_rsrc__vgpr_vindex__vgpr
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_i8_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_i8_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_i8_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%ext = zext i8 %val to i32
%cast = bitcast i32 %ext to float
@@ -478,22 +688,39 @@ define amdgpu_ps float @struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%ext = sext i8 %val to i32
%cast = bitcast i32 %ext to float
@@ -519,23 +746,41 @@ define amdgpu_ps float @struct_buffer_load_i8_sext_wrong_width(<4 x i32> inreg %
; GFX8-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_i8_sext_wrong_width
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
- ; GFX12-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN]], 0, 4, implicit $exec
- ; GFX12-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_i8_sext_wrong_width
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1200-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN]], 0, 4, implicit $exec
+ ; GFX1200-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_i8_sext_wrong_width
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1250-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN]], 0, 4, implicit $exec
+ ; GFX1250-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%trunc = trunc i8 %val to i4
%ext = sext i4 %trunc to i32
@@ -561,22 +806,39 @@ define amdgpu_ps float @struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgp
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%ext = zext i16 %val to i32
%cast = bitcast i32 %ext to float
@@ -601,22 +863,39 @@ define amdgpu_ps float @struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgp
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%ext = sext i16 %val to i32
%cast = bitcast i32 %ext to float
@@ -642,23 +921,41 @@ define amdgpu_ps float @struct_buffer_load_i16_sext_wrong_width(<4 x i32> inreg
; GFX8-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_i16_sext_wrong_width
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]], 0, 8, implicit $exec
- ; GFX12-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_i16_sext_wrong_width
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]], 0, 8, implicit $exec
+ ; GFX1200-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_i16_sext_wrong_width
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]], 0, 8, implicit $exec
+ ; GFX1250-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%trunc = trunc i16 %val to i8
%ext = sext i8 %trunc to i32
@@ -685,22 +982,39 @@ define amdgpu_ps half @struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voff
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret half %val
}
@@ -724,22 +1038,39 @@ define amdgpu_ps <2 x half> @struct_buffer_load_v2f16__sgpr_rsrc__vgpr_vindex__v
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x half> %val
}
@@ -772,25 +1103,45 @@ define amdgpu_ps <4 x half> @struct_buffer_load_v4f16__sgpr_rsrc__vgpr_vindex__v
; GFX8-NEXT: $vgpr1 = COPY [[COPY8]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
- ; GFX12-LABEL: name: struct_buffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub1
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY7]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY8]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ; GFX1200-LABEL: name: struct_buffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub1
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY7]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub1
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY7]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <4 x half> %val
}
@@ -814,22 +1165,39 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 1)
ret float %val
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.tfe.ll
index 674fe1c194e88..39cce20cc63f7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.tfe.ll
@@ -5,7 +5,8 @@
; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefix=GFX910
; RUN: llc -global-isel -mcpu=gfx1010 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefix=GFX910
; RUN: llc -global-isel -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX11
-; RUN: llc -global-isel -mcpu=gfx1200 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX12
+; RUN: llc -global-isel -mcpu=gfx1200 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX1200
+; RUN: llc -global-isel -mcpu=gfx1250 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX1250
define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) {
; GFX67-LABEL: name: raw_buffer_load_i8_tfe
@@ -114,29 +115,53 @@ define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_i8_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s8) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_i8_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s8) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_i8_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s8) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { i8, i32 } @llvm.amdgcn.struct.buffer.load.sl_i8i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { i8, i32 } %res, 0
store i8 %data, ptr addrspace(1) %data_addr
@@ -252,29 +277,53 @@ define amdgpu_ps void @raw_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_i16_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_i16_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_i16_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { i16, i32 } @llvm.amdgcn.struct.buffer.load.sl_i16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { i16, i32 } %res, 0
store i16 %data, ptr addrspace(1) %data_addr
@@ -390,29 +439,53 @@ define amdgpu_ps void @raw_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_f16_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_f16_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f16_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { half, i32 } @llvm.amdgcn.struct.buffer.load.sl_f16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { half, i32 } %res, 0
store half %data, ptr addrspace(1) %data_addr
@@ -528,29 +601,53 @@ define amdgpu_ps void @raw_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_i32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_i32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_i32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { i32, i32 } %res, 0
store i32 %data, ptr addrspace(1) %data_addr
@@ -674,31 +771,57 @@ define amdgpu_ps void @raw_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v2i32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub2
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v2i32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v2i32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_96_align2 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <2 x i32>, i32 } %res, 0
store <2 x i32> %data, ptr addrspace(1) %data_addr
@@ -822,31 +945,57 @@ define amdgpu_ps void @raw_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v2f32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub2
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v2f32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v2f32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_96_align2 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <2 x float>, i32 } %res, 0
store <2 x float> %data, ptr addrspace(1) %data_addr
@@ -1018,32 +1167,59 @@ define amdgpu_ps void @raw_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v3i32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub2
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub3
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v3i32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub3
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v3i32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub3
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <3 x i32>, i32 } %res, 0
store <3 x i32> %data, ptr addrspace(1) %data_addr
@@ -1215,32 +1391,59 @@ define amdgpu_ps void @raw_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v3f32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub2
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub3
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v3f32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub3
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v3f32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub3
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <3 x float>, i32 } %res, 0
store <3 x float> %data, ptr addrspace(1) %data_addr
@@ -1372,33 +1575,61 @@ define amdgpu_ps void @raw_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v4i32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_160 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub2
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub4
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v4i32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_160 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub4
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v4i32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_160_align2 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub4
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <4 x i32>, i32 } %res, 0
store <4 x i32> %data, ptr addrspace(1) %data_addr
@@ -1530,33 +1761,61 @@ define amdgpu_ps void @raw_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v4f32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_160 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub2
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub4
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v4f32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_160 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub4
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v4f32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_160_align2 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub4
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <4 x float>, i32 } %res, 0
store <4 x float> %data, ptr addrspace(1) %data_addr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
index 8183d8532cdfb..c9771b5aca0db 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1250 %s
; Natural mapping
define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
@@ -21,22 +22,39 @@ define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex_
; GFX8-NEXT: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -61,24 +79,43 @@ define amdgpu_ps void @struct_buffer_store_v2f32_sgpr_rsrc__vgpr_val__vgpr_vinde
; GFX8-NEXT: BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_v2f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_v2f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_v2f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -104,25 +141,45 @@ define amdgpu_ps void @struct_buffer_store_v3f32_sgpr_rsrc__vgpr_val__vgpr_vinde
; GFX8-NEXT: BUFFER_STORE_DWORDX3_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_v3f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_v3f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_v3f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -149,26 +206,47 @@ define amdgpu_ps void @struct_buffer_store_v4f32_sgpr_rsrc__vgpr_val__vgpr_vinde
; GFX8-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_v4f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_v4f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_v4f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -233,64 +311,123 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vindex__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; GFX12-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vindex__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vindex__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -313,22 +450,39 @@ define amdgpu_ps void @struct_buffer_store_i8_sgpr_rsrc__vgpr_val__vgpr_vindex__
; GFX8-NEXT: BUFFER_STORE_BYTE_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_i8_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_BYTE_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_i8_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_BYTE_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_i8_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_BYTE_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%val.trunc = trunc i32 %val to i8
call void @llvm.amdgcn.struct.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
@@ -352,22 +506,39 @@ define amdgpu_ps void @struct_buffer_store_i16_sgpr_rsrc__vgpr_val__vgpr_vindex_
; GFX8-NEXT: BUFFER_STORE_SHORT_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_i16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_SHORT_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_i16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_SHORT_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_i16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_SHORT_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%val.trunc = trunc i32 %val to i16
call void @llvm.amdgcn.struct.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
@@ -391,22 +562,39 @@ define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex_
; GFX8-NEXT: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 1)
ret void
}
@@ -429,22 +617,39 @@ define amdgpu_ps void @struct_buffer_store_v2f16_sgpr_rsrc__vgpr_val__vgpr_vinde
; GFX8-NEXT: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_v2f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_v2f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_v2f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -475,24 +680,43 @@ define amdgpu_ps void @struct_buffer_store_v4f16_sgpr_rsrc__vgpr_val__vgpr_vinde
; GFX8-NEXT: BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_v4f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_v4f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_v4f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
index 7a20b5c9c51ae..a2c1545743039 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
@@ -1,27 +1,52 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=CHECK,CHECK-SDAG-TRUE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB0_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB0_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_atomic_buffer_load_i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB0_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB0_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_atomic_buffer_load_i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB0_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB0_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -34,23 +59,42 @@ bb2:
}
define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_i32_off:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB1_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB1_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_atomic_buffer_load_i32_off:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB1_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB1_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_atomic_buffer_load_i32_off:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB1_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB1_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -62,23 +106,43 @@ bb2:
ret void
}
define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_i32_soff:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB2_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB2_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_atomic_buffer_load_i32_soff:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB2_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB2_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_atomic_buffer_load_i32_soff:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_mov_b32 s5, 4
+; GFX12-NEXT: .LBB2_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], s5 offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB2_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -90,23 +154,42 @@ bb2:
ret void
}
define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_i32_dlc:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB3_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB3_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_atomic_buffer_load_i32_dlc:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB3_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB3_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_atomic_buffer_load_i32_dlc:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB3_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT_RT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB3_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -119,24 +202,44 @@ bb2:
}
define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) {
-; CHECK-LABEL: raw_nonatomic_buffer_load_i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: .LBB4_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT: s_or_b32 s0, s1, s0
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; CHECK-NEXT: s_cbranch_execnz .LBB4_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_nonatomic_buffer_load_i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: .LBB4_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB4_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_nonatomic_buffer_load_i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: .LBB4_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b32 s0, s1, s0
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB4_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -149,23 +252,43 @@ bb2:
}
define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_i64:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB5_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB5_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_atomic_buffer_load_i64:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB5_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB5_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_atomic_buffer_load_i64:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB5_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB5_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.zext = zext i32 %id to i64
@@ -179,23 +302,42 @@ bb2:
}
define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_v2i16:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB6_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB6_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_atomic_buffer_load_v2i16:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB6_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB6_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_atomic_buffer_load_v2i16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB6_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB6_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -209,68 +351,151 @@ bb2:
}
define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
-; CHECK-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
-; CHECK-SDAG-TRUE16: ; %bb.0: ; %bb
-; CHECK-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
-; CHECK-SDAG-TRUE16-NEXT: .LBB7_1: ; %bb1
-; CHECK-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; CHECK-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; CHECK-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-SDAG-TRUE16-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: .LBB7_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
;
-; CHECK-FAKE16-LABEL: raw_atomic_buffer_load_v4i16:
-; CHECK-FAKE16: ; %bb.0: ; %bb
-; CHECK-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-FAKE16-NEXT: s_mov_b32 s4, 0
-; CHECK-FAKE16-NEXT: .LBB7_1: ; %bb1
-; CHECK-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-FAKE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; CHECK-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-FAKE16-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-FAKE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-FAKE16-NEXT: s_endpgm
+; GFX11-FAKE16-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: .LBB7_1: ; %bb1
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-FAKE16-NEXT: s_endpgm
;
-; CHECK-GISEL-LABEL: raw_atomic_buffer_load_v4i16:
-; CHECK-GISEL: ; %bb.0: ; %bb
-; CHECK-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-GISEL-NEXT: s_mov_b32 s4, 0
-; CHECK-GISEL-NEXT: .LBB7_1: ; %bb1
-; CHECK-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v1
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v2
-; CHECK-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; CHECK-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
-; CHECK-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-GISEL-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-GISEL-NEXT: ; %bb.2: ; %bb2
-; CHECK-GISEL-NEXT: s_endpgm
+; GFX11-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-GISEL-TRUE16-NEXT: .LBB7_1: ; %bb1
+; GFX11-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX11-GISEL: ; %bb.0: ; %bb
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX11-GISEL-NEXT: .LBB7_1: ; %bb1
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX11-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %bb2
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-SDAG-TRUE16-NEXT: .LBB7_1: ; %bb1
+; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX12-FAKE16: ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT: .LBB7_1: ; %bb1
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT: s_endpgm
+;
+; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-GISEL-TRUE16-NEXT: .LBB7_1: ; %bb1
+; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-GISEL-TRUE16-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -285,23 +510,42 @@ bb2:
}
define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_v4i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB8_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_atomic_buffer_load_v4i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB8_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_atomic_buffer_load_v4i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB8_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b128 v[2:5], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -315,25 +559,46 @@ bb2:
}
define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_ptr:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB9_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_load_b32 v1, v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB9_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_atomic_buffer_load_ptr:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB9_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: flat_load_b32 v1, v[1:2]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_atomic_buffer_load_ptr:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB9_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: flat_load_b32 v1, v[2:3]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll
index 5c0e34c5e2ec0..d51e912a41bf9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll
@@ -1,58 +1,95 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 < %s | FileCheck -check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX12 %s
define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 offen offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 offen offset:24
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v1, 24, v1
+; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 24
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
}
define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
-; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[16:19], s20
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_atomic_add_f32 v0, off, s[16:19], s20
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
ret void
}
define void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[16:19], s20 offen
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[16:19], s20 offen
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
define void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[16:19], s20 offset:92
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_atomic_pk_add_f16 v0, off, s[16:19], s20 offset:92
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
ret void
}
define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 offen slc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 offen slc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_NT
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
index 8a6594f93bd9d..1a1a1f784464d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
@@ -6,6 +6,7 @@
; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX910,GFX10
; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX11
; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -mcpu=gfx1250 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12
define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) {
; GFX67-LABEL: raw_buffer_load_i8_tfe:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
index 89511deaa2bbc..eeea1456792af 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
@@ -3,6 +3,7 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=GFX68,GFX8 %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefixes=GFX11 %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
; GFX68-LABEL: buffer_store:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
index 561ec7de94bc6..6f7c001e03e26 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
@@ -1,27 +1,52 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=CHECK,CHECK-SDAG-TRUE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_ptr_load_i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB0_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB0_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_ptr_atomic_buffer_ptr_load_i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB0_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB0_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_ptr_atomic_buffer_ptr_load_i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB0_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB0_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -34,23 +59,42 @@ bb2:
}
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_off:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB1_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB1_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_ptr_atomic_buffer_load_i32_off:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB1_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB1_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_ptr_atomic_buffer_load_i32_off:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB1_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB1_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -62,23 +106,43 @@ bb2:
ret void
}
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_soff:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB2_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB2_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_ptr_atomic_buffer_load_i32_soff:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB2_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB2_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_ptr_atomic_buffer_load_i32_soff:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_mov_b32 s5, 4
+; GFX12-NEXT: .LBB2_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], s5 offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB2_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -90,23 +154,42 @@ bb2:
ret void
}
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_dlc:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB3_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB3_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_ptr_atomic_buffer_load_i32_dlc:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB3_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB3_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_ptr_atomic_buffer_load_i32_dlc:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB3_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT_RT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB3_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -119,24 +202,44 @@ bb2:
}
define amdgpu_kernel void @raw_nonptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_nonptr_atomic_buffer_load_i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: .LBB4_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT: s_or_b32 s0, s1, s0
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; CHECK-NEXT: s_cbranch_execnz .LBB4_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_nonptr_atomic_buffer_load_i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: .LBB4_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB4_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_nonptr_atomic_buffer_load_i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: .LBB4_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b32 s0, s1, s0
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB4_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -149,23 +252,43 @@ bb2:
}
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_i64:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB5_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB5_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_ptr_atomic_buffer_load_i64:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB5_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB5_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_ptr_atomic_buffer_load_i64:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB5_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB5_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.zext = zext i32 %id to i64
@@ -179,23 +302,42 @@ bb2:
}
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_v2i16:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB6_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB6_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_ptr_atomic_buffer_load_v2i16:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB6_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB6_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_ptr_atomic_buffer_load_v2i16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB6_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB6_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -209,68 +351,151 @@ bb2:
}
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %ptr) {
-; CHECK-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
-; CHECK-SDAG-TRUE16: ; %bb.0: ; %bb
-; CHECK-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
-; CHECK-SDAG-TRUE16-NEXT: .LBB7_1: ; %bb1
-; CHECK-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; CHECK-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; CHECK-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-SDAG-TRUE16-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: .LBB7_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
;
-; CHECK-FAKE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
-; CHECK-FAKE16: ; %bb.0: ; %bb
-; CHECK-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-FAKE16-NEXT: s_mov_b32 s4, 0
-; CHECK-FAKE16-NEXT: .LBB7_1: ; %bb1
-; CHECK-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-FAKE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; CHECK-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-FAKE16-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-FAKE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-FAKE16-NEXT: s_endpgm
+; GFX11-FAKE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: .LBB7_1: ; %bb1
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-FAKE16-NEXT: s_endpgm
;
-; CHECK-GISEL-LABEL: raw_ptr_atomic_buffer_load_v4i16:
-; CHECK-GISEL: ; %bb.0: ; %bb
-; CHECK-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-GISEL-NEXT: s_mov_b32 s4, 0
-; CHECK-GISEL-NEXT: .LBB7_1: ; %bb1
-; CHECK-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v1
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v2
-; CHECK-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; CHECK-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
-; CHECK-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-GISEL-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-GISEL-NEXT: ; %bb.2: ; %bb2
-; CHECK-GISEL-NEXT: s_endpgm
+; GFX11-GISEL-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-GISEL-TRUE16-NEXT: .LBB7_1: ; %bb1
+; GFX11-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX11-GISEL: ; %bb.0: ; %bb
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX11-GISEL-NEXT: .LBB7_1: ; %bb1
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX11-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %bb2
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-SDAG-TRUE16-NEXT: .LBB7_1: ; %bb1
+; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX12-FAKE16: ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT: .LBB7_1: ; %bb1
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT: s_endpgm
+;
+; GFX12-GISEL-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-GISEL-TRUE16-NEXT: .LBB7_1: ; %bb1
+; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-GISEL-TRUE16-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -285,23 +510,42 @@ bb2:
}
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_v4i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB8_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_ptr_atomic_buffer_load_v4i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB8_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_ptr_atomic_buffer_load_v4i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB8_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b128 v[2:5], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -315,25 +559,46 @@ bb2:
}
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_ptr:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB9_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_load_b32 v1, v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB9_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_ptr_atomic_buffer_load_ptr:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB9_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: flat_load_b32 v1, v[1:2]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_ptr_atomic_buffer_load_ptr:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB9_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: flat_load_b32 v1, v[2:3]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
index 8b6ba1a3cc094..2c3b5210ae7bd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
@@ -1,104 +1,174 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; FIXME: Test 90a, 940. 908 should fail to select.
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) #0 {
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s16 offen offset:128 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s16 offen offset:128 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_add_nc_u32_e32 v1, 0x80, v1
+; GFX1250-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 128
%ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret <2 x bfloat> %ret
}
define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) #0 {
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_NT_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_NT_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2)
ret <2 x bfloat> %ret
}
define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) #0 {
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s16 offen offset:128
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s16 offen offset:128
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_add_nc_u32_e32 v1, 0x80, v1
+; GFX1250-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s16 offen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 128
%unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
}
define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) #0 {
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_NT
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_NT
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_NT
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2)
ret void
}
; Test waterfall loop
define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset_add__vgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) #0 {
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v1
-; GFX12-NEXT: v_readfirstlane_b32 s5, v2
-; GFX12-NEXT: v_readfirstlane_b32 s6, v3
-; GFX12-NEXT: v_readfirstlane_b32 s7, v4
-; GFX12-NEXT: v_readfirstlane_b32 s3, v6
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v6
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v5, s[4:7], s3 offen offset:128 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
-; GFX12-NEXT: ; implicit-def: $vgpr6
-; GFX12-NEXT: ; implicit-def: $vgpr5
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB4_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_mov_b32 s2, exec_lo
+; GFX1200-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1200-NEXT: v_readfirstlane_b32 s5, v2
+; GFX1200-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1200-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1200-NEXT: v_readfirstlane_b32 s3, v6
+; GFX1200-NEXT: s_wait_alu 0xf1ff
+; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
+; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v6
+; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_and_b32 s0, s0, s1
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v5, s[4:7], s3 offen offset:128 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX1200-NEXT: ; implicit-def: $vgpr6
+; GFX1200-NEXT: ; implicit-def: $vgpr5
+; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1200-NEXT: s_cbranch_execnz .LBB4_1
+; GFX1200-NEXT: ; %bb.2:
+; GFX1200-NEXT: s_mov_b32 exec_lo, s2
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v11, v4 :: v_dual_mov_b32 v10, v3
+; GFX1250-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v8, v1
+; GFX1250-NEXT: v_add_nc_u32_e32 v1, 0x80, v5
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v8
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v9
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v10
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v11
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[8:9]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[10:11]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, s3, v6
+; GFX1250-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_b32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[4:7], s3 offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11
+; GFX1250-NEXT: ; implicit-def: $vgpr6
+; GFX1250-NEXT: ; implicit-def: $vgpr1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB4_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 128
%ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret <2 x bfloat> %ret
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll
index 8141e0df46737..ea8f836ab49a3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll
@@ -2,7 +2,8 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) #0 {
; GFX908-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
@@ -26,15 +27,22 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen scope:SCOPE_SYS
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen scope:SCOPE_SYS
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen scope:SCOPE_SYS
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 24)
ret void
}
@@ -61,15 +69,22 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
ret void
}
@@ -96,15 +111,22 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -131,15 +153,22 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffs
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0)
ret void
}
@@ -166,15 +195,22 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_NT
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_NT
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_NT
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll
index 767117dc99fd4..28387403c5048 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) #0 {
; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
@@ -18,16 +19,24 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 24)
ret float %ret
}
@@ -47,16 +56,24 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset_
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
ret float %ret
}
@@ -76,16 +93,24 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgp
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret <2 x half> %ret
}
@@ -105,16 +130,24 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_v
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0)
ret <2 x half> %ret
}
@@ -134,16 +167,24 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_NT_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_NT_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2)
ret float %ret
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll
index 3540468566147..4dd258b7bda82 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll
@@ -3,7 +3,8 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX12 %s
define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) {
; GFX7-LABEL: raw_ptr_buffer_load_bf16:
@@ -41,6 +42,14 @@ define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) {
; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_ptr_buffer_load_bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v0, off, s[0:3], null
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%val = call bfloat @llvm.amdgcn.raw.ptr.buffer.load.v2bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
ret bfloat %val
}
@@ -82,6 +91,14 @@ define <2 x bfloat> @raw_ptr_buffer_load_v2bf16(ptr addrspace(8) inreg %rsrc) {
; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_ptr_buffer_load_v2bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%val = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v2bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
ret <2 x bfloat> %val
}
@@ -125,6 +142,14 @@ define <4 x bfloat> @raw_ptr_buffer_load_v4bf16(ptr addrspace(8) inreg %rsrc) {
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_ptr_buffer_load_v4bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%val = call <4 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v4bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
ret <4 x bfloat> %val
}
@@ -178,6 +203,14 @@ define <8 x bfloat> @raw_ptr_buffer_load_v8bf16(ptr addrspace(8) inreg %rsrc) {
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_ptr_buffer_load_v8bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%val = call <8 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v8bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
ret <8 x bfloat> %val
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
index e1f84dcbaa607..ec7d7d467ffc6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
@@ -3,7 +3,8 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX12 %s
define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %data, i32 %offset) {
; GFX7-LABEL: buffer_store_bf16:
@@ -32,6 +33,11 @@ define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %d
; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 offen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b16 v0, v1, s[0:3], null offen
+; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
ret void
}
@@ -65,6 +71,11 @@ define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bf
; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_v2bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b32 v0, v1, s[0:3], null offen
+; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.store.v2bf16(<2 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
ret void
}
@@ -102,6 +113,11 @@ define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bf
; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_v4bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null offen
+; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
ret void
}
@@ -153,6 +169,11 @@ define amdgpu_ps void @buffer_store_v8bf16(ptr addrspace(8) inreg %rsrc, <8 x bf
; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 offen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_v8bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], null offen
+; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.store.v8bf16(<8 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
index f6f614e6e41b5..88963643218a5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
@@ -1,30 +1,58 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-SDAG-TRUE16
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
-; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
-; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB0_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB0_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB0_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB0_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB0_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB0_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -37,23 +65,43 @@ bb2:
}
define amdgpu_kernel void @struct_atomic_buffer_load_i32_const_idx(<4 x i32> %addr) {
-; CHECK-LABEL: struct_atomic_buffer_load_i32_const_idx:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB1_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB1_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_i32_const_idx:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB1_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB1_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_i32_const_idx:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 15
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB1_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB1_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -66,26 +114,48 @@ bb2:
}
define amdgpu_kernel void @struct_atomic_buffer_load_i32_off(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_i32_off:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB2_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB2_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_i32_off:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB2_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB2_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_i32_off:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB2_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB2_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -98,26 +168,49 @@ bb2:
}
define amdgpu_kernel void @struct_atomic_buffer_load_i32_soff(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_i32_soff:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB3_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB3_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_i32_soff:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB3_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB3_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_i32_soff:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_mov_b32 s5, 4
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB3_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], s5 idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB3_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -129,26 +222,48 @@ bb2:
ret void
}
define amdgpu_kernel void @struct_atomic_buffer_load_i32_dlc(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_i32_dlc:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB4_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB4_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_i32_dlc:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB4_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB4_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_i32_dlc:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB4_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT_RT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB4_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -161,26 +276,49 @@ bb2:
}
define amdgpu_kernel void @struct_nonatomic_buffer_load_i32(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_nonatomic_buffer_load_i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: .LBB5_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT: s_or_b32 s0, s1, s0
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; CHECK-NEXT: s_cbranch_execnz .LBB5_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_nonatomic_buffer_load_i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: .LBB5_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB5_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_nonatomic_buffer_load_i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: .LBB5_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b32 s0, s1, s0
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB5_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -193,26 +331,49 @@ bb2:
}
define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_i64:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v2, s6
-; CHECK-NEXT: .LBB6_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1]
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB6_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_i64:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, s6
+; GFX11-NEXT: .LBB6_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1]
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB6_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_i64:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: .LBB6_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB6_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.zext = zext i32 %id to i64
@@ -226,26 +387,48 @@ bb2:
}
define amdgpu_kernel void @struct_atomic_buffer_load_v2i16(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_v2i16:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB7_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_v2i16:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB7_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_v2i16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB7_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -259,77 +442,172 @@ bb2:
}
define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 %index) {
-; CHECK-SDAG-TRUE16-LABEL: struct_atomic_buffer_load_v4i16:
-; CHECK-SDAG-TRUE16: ; %bb.0: ; %bb
-; CHECK-SDAG-TRUE16-NEXT: s_clause 0x1
-; CHECK-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
-; CHECK-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1
-; CHECK-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; CHECK-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-SDAG-TRUE16-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: struct_atomic_buffer_load_v4i16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_clause 0x1
+; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: struct_atomic_buffer_load_v4i16:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-FAKE16-NEXT: .LBB8_1: ; %bb1
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-TRUE16-LABEL: struct_atomic_buffer_load_v4i16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX11-GISEL-TRUE16-NEXT: s_clause 0x1
+; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1
+; GFX11-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: struct_atomic_buffer_load_v4i16:
+; GFX11-GISEL: ; %bb.0: ; %bb
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-GISEL-NEXT: .LBB8_1: ; %bb1
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v2
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX11-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX11-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %bb2
+; GFX11-GISEL-NEXT: s_endpgm
;
-; CHECK-FAKE16-LABEL: struct_atomic_buffer_load_v4i16:
-; CHECK-FAKE16: ; %bb.0: ; %bb
-; CHECK-FAKE16-NEXT: s_clause 0x1
-; CHECK-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-FAKE16-NEXT: s_mov_b32 s4, 0
-; CHECK-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-FAKE16-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-FAKE16-NEXT: .LBB8_1: ; %bb1
-; CHECK-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-FAKE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-FAKE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-FAKE16-NEXT: s_endpgm
+; GFX12-SDAG-TRUE16-LABEL: struct_atomic_buffer_load_v4i16:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1
+; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1
+; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
;
-; CHECK-GISEL-LABEL: struct_atomic_buffer_load_v4i16:
-; CHECK-GISEL: ; %bb.0: ; %bb
-; CHECK-GISEL-NEXT: s_clause 0x1
-; CHECK-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-GISEL-NEXT: s_mov_b32 s4, 0
-; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-GISEL-NEXT: .LBB8_1: ; %bb1
-; CHECK-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-GISEL-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; CHECK-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; CHECK-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
-; CHECK-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-GISEL-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-GISEL-NEXT: ; %bb.2: ; %bb2
-; CHECK-GISEL-NEXT: s_endpgm
+; GFX12-FAKE16-LABEL: struct_atomic_buffer_load_v4i16:
+; GFX12-FAKE16: ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT: s_clause 0x1
+; GFX12-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-FAKE16-NEXT: .LBB8_1: ; %bb1
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT: s_endpgm
+;
+; GFX12-GISEL-TRUE16-LABEL: struct_atomic_buffer_load_v4i16:
+; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT: s_clause 0x1
+; GFX12-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1
+; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-GISEL-TRUE16-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -344,26 +622,48 @@ bb2:
}
define amdgpu_kernel void @struct_atomic_buffer_load_v4i32(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_v4i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB9_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB9_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_v4i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB9_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_v4i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB9_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -377,28 +677,52 @@ bb2:
}
define amdgpu_kernel void @struct_atomic_buffer_load_ptr(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_ptr:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB10_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_load_b32 v2, v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB10_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_ptr:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB10_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: flat_load_b32 v2, v[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_ptr:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB10_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: flat_load_b32 v2, v[2:3]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
index 13b28d408ba88..9abbc064803da 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
@@ -6,6 +6,7 @@
; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX910,GFX10
; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX11
; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -mcpu=gfx1250 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12
define amdgpu_ps void @struct_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) {
; GFX67-LABEL: struct_buffer_load_i8_tfe:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
index 9ce33c68c463f..8b46061cebcf7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
@@ -3,6 +3,8 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=GFX68,GFX8 %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
; GFX68-LABEL: buffer_store:
@@ -21,6 +23,15 @@ define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <
; GFX11-NEXT: buffer_store_b128 v[4:7], v12, s[0:3], 0 idxen glc
; GFX11-NEXT: buffer_store_b128 v[8:11], v12, s[0:3], 0 idxen slc
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_mov_b32_e32 v12, 0
+; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: buffer_store_b128 v[0:3], v12, s[0:3], null idxen
+; GFX12-NEXT: buffer_store_b128 v[4:7], v12, s[0:3], null idxen th:TH_STORE_NT
+; GFX12-NEXT: buffer_store_b128 v[8:11], v12, s[0:3], null idxen th:TH_STORE_HT
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
@@ -40,6 +51,12 @@ define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen offset:42
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_immoffs:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], null idxen offset:42
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0)
ret void
@@ -55,6 +72,11 @@ define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_idx:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)
ret void
@@ -76,6 +98,12 @@ define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
; GFX11-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, s4
; GFX11-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], 0 idxen offen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_ofs:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, 0
+; GFX12-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], null idxen offen
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0)
ret void
@@ -91,6 +119,11 @@ define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32)
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], 0 idxen offen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_both:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], null idxen offen
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0)
ret void
@@ -108,6 +141,12 @@ define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>,
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: buffer_store_b128 v[0:3], v[5:6], s[0:3], 0 idxen offen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_both_reversed:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_dual_mov_b32 v6, v5 :: v_dual_mov_b32 v7, v4
+; GFX12-NEXT: buffer_store_b128 v[0:3], v[6:7], s[0:3], null idxen offen
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0)
ret void
@@ -139,6 +178,14 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32,
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_store_b128 v[0:3], v6, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_wait:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], null idxen
+; GFX12-NEXT: buffer_load_b128 v[0:3], v5, s[0:3], null idxen
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_store_b128 v[0:3], v6, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0)
@@ -156,6 +203,11 @@ define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_x1:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: buffer_store_b32 v0, v1, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
ret void
@@ -171,6 +223,11 @@ define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data,
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_x2:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
ret void
@@ -193,6 +250,15 @@ define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i
; GFX11-NEXT: buffer_store_b64 v[4:5], v7, s[0:3], 0 idxen glc
; GFX11-NEXT: buffer_store_b32 v6, v7, s[0:3], 0 idxen slc
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_int:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_mov_b32_e32 v7, 0
+; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: buffer_store_b128 v[0:3], v7, s[0:3], null idxen
+; GFX12-NEXT: buffer_store_b64 v[4:5], v7, s[0:3], null idxen th:TH_STORE_NT
+; GFX12-NEXT: buffer_store_b32 v6, v7, s[0:3], null idxen th:TH_STORE_HT
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
@@ -212,6 +278,12 @@ define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: buffer_store_b8 v0, v1, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_buffer_store_byte:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX12-NEXT: buffer_store_b8 v0, v1, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
main_body:
%v2 = fptoui float %v1 to i32
%v3 = trunc i32 %v2 to i8
@@ -237,6 +309,18 @@ define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1,
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 idxen
; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX12-TRUE16-LABEL: struct_buffer_store_f16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX12-TRUE16-NEXT: buffer_store_b16 v0, v1, s[0:3], null idxen
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: struct_buffer_store_f16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX12-FAKE16-NEXT: buffer_store_b16 v0, v1, s[0:3], null idxen
+; GFX12-FAKE16-NEXT: s_endpgm
%v2 = fptrunc float %v1 to half
call void @llvm.amdgcn.struct.buffer.store.f16(half %v2, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
ret void
@@ -261,6 +345,11 @@ define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x hal
; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_buffer_store_v2f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b32 v0, v1, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
ret void
}
@@ -288,6 +377,11 @@ define amdgpu_ps void @struct_buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x hal
; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_buffer_store_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
ret void
}
@@ -304,6 +398,12 @@ define amdgpu_ps void @struct_buffer_store_i16(<4 x i32> inreg %rsrc, float %v1,
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_buffer_store_i16:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX12-NEXT: buffer_store_b16 v0, v1, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
main_body:
%v2 = fptoui float %v1 to i32
%v3 = trunc i32 %v2 to i16
@@ -329,6 +429,11 @@ define amdgpu_ps void @struct_buffer_store_vif16(<4 x i32> inreg %rsrc, <2 x i16
; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_buffer_store_vif16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b32 v0, v1, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
ret void
}
@@ -354,6 +459,11 @@ define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16
; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_buffer_store_v4i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
index 8f33dd6e3a69a..23db2479f66bb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
@@ -1,30 +1,58 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-SDAG-TRUE16
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
-; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
-; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB0_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB0_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB0_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB0_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB0_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB0_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -37,23 +65,43 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_const_idx(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_const_idx:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB1_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB1_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_i32_const_idx:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB1_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB1_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_const_idx:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 15
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB1_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB1_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -66,26 +114,48 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_off:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB2_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB2_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_i32_off:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB2_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB2_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_off:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB2_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB2_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -98,26 +168,49 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_soff:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB3_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB3_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_i32_soff:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB3_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB3_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_soff:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_mov_b32 s5, 4
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB3_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], s5 idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB3_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -129,26 +222,48 @@ bb2:
ret void
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_dlc:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB4_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB4_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_i32_dlc:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB4_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB4_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_dlc:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB4_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT_RT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB4_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -161,26 +276,49 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_nonatomic_buffer_load_i32(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_nonatomic_buffer_load_i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: .LBB5_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT: s_or_b32 s0, s1, s0
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; CHECK-NEXT: s_cbranch_execnz .LBB5_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_nonatomic_buffer_load_i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: .LBB5_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB5_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_nonatomic_buffer_load_i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: .LBB5_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b32 s0, s1, s0
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB5_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -193,26 +331,49 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_i64:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v2, s6
-; CHECK-NEXT: .LBB6_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1]
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB6_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_i64:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, s6
+; GFX11-NEXT: .LBB6_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1]
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB6_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_i64:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: .LBB6_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB6_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.zext = zext i32 %id to i64
@@ -226,26 +387,48 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_v2i16:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB7_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_v2i16:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB7_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_v2i16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB7_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -259,77 +442,172 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-SDAG-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
-; CHECK-SDAG-TRUE16: ; %bb.0: ; %bb
-; CHECK-SDAG-TRUE16-NEXT: s_clause 0x1
-; CHECK-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
-; CHECK-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1
-; CHECK-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; CHECK-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-SDAG-TRUE16-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_clause 0x1
+; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-FAKE16-NEXT: .LBB8_1: ; %bb1
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX11-GISEL-TRUE16-NEXT: s_clause 0x1
+; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1
+; GFX11-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; GFX11-GISEL: ; %bb.0: ; %bb
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-GISEL-NEXT: .LBB8_1: ; %bb1
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v2
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX11-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX11-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %bb2
+; GFX11-GISEL-NEXT: s_endpgm
;
-; CHECK-FAKE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
-; CHECK-FAKE16: ; %bb.0: ; %bb
-; CHECK-FAKE16-NEXT: s_clause 0x1
-; CHECK-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-FAKE16-NEXT: s_mov_b32 s4, 0
-; CHECK-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-FAKE16-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-FAKE16-NEXT: .LBB8_1: ; %bb1
-; CHECK-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-FAKE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-FAKE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-FAKE16-NEXT: s_endpgm
+; GFX12-SDAG-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1
+; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1
+; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
;
-; CHECK-GISEL-LABEL: struct_ptr_atomic_buffer_load_v4i16:
-; CHECK-GISEL: ; %bb.0: ; %bb
-; CHECK-GISEL-NEXT: s_clause 0x1
-; CHECK-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-GISEL-NEXT: s_mov_b32 s4, 0
-; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-GISEL-NEXT: .LBB8_1: ; %bb1
-; CHECK-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-GISEL-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; CHECK-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; CHECK-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
-; CHECK-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-GISEL-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-GISEL-NEXT: ; %bb.2: ; %bb2
-; CHECK-GISEL-NEXT: s_endpgm
+; GFX12-FAKE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; GFX12-FAKE16: ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT: s_clause 0x1
+; GFX12-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-FAKE16-NEXT: .LBB8_1: ; %bb1
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT: s_endpgm
+;
+; GFX12-GISEL-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT: s_clause 0x1
+; GFX12-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1
+; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-GISEL-TRUE16-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -344,26 +622,48 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_v4i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB9_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB9_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_v4i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB9_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_v4i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB9_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -377,28 +677,52 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_ptr:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB10_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_load_b32 v2, v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB10_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_ptr:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB10_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: flat_load_b32 v2, v[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_ptr:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB10_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: flat_load_b32 v2, v[2:3]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
index 746b8791c39f9..4366472c73a0e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 {
; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
@@ -39,6 +40,14 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s16 idxen offen
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s16 idxen offen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -75,6 +84,13 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 idxen
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 idxen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
}
@@ -114,6 +130,14 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret void
}
@@ -153,6 +177,14 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s16 idxen offen
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s16 idxen offen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -291,6 +323,42 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v8, v5
+; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1250-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_b32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[4:7], s3 idxen offen
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX1250-NEXT: ; implicit-def: $vgpr7
+; GFX1250-NEXT: ; implicit-def: $vgpr0
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB4_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -429,6 +497,42 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v8, v5
+; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1250-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_b32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s3 idxen offen
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX1250-NEXT: ; implicit-def: $vgpr7
+; GFX1250-NEXT: ; implicit-def: $vgpr0
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB5_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
index 71c63bfd69734..0191a85b33888 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
@@ -2,6 +2,7 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 {
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
@@ -32,6 +33,15 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
}
@@ -62,6 +72,14 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffs
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret float %ret
}
@@ -95,6 +113,15 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret float %ret
}
@@ -128,6 +155,15 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__
; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x half> %ret
}
@@ -237,6 +273,43 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v8, v5
+; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1250-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_b32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX1250-NEXT: ; implicit-def: $vgpr7
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB4_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
}
@@ -346,6 +419,43 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v8, v5
+; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1250-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_b32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX1250-NEXT: ; implicit-def: $vgpr7
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB5_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x half> %ret
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
index e3889ab8f5a21..d551d91e5ab12 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
@@ -4,7 +4,8 @@
; Not supported in gfx8 or gfx9
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
@@ -35,16 +36,25 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
}
@@ -78,16 +88,25 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_add_nc_u32 v5, 0x100, v2
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v[4:5], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret float %ret
@@ -122,16 +141,24 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffs
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret float %ret
}
@@ -165,16 +192,25 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret float %ret
}
@@ -206,15 +242,23 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s16 idxen offen
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v[2:3], s[0:3], s16 idxen offen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -246,15 +290,23 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_add_nc_u32 v5, 0x100, v2
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v[4:5], s[0:3], s16 idxen offen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -288,15 +340,22 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff
; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s16 idxen
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s16 idxen
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s16 idxen
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s16 idxen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
}
@@ -328,15 +387,23 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s16 idxen offen slc
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret void
}
@@ -442,36 +509,68 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v1
-; GFX12-NEXT: v_readfirstlane_b32 s5, v2
-; GFX12-NEXT: v_readfirstlane_b32 s6, v3
-; GFX12-NEXT: v_readfirstlane_b32 s7, v4
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
-; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s1, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
-; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_mov_b32 s2, exec_lo
+; GFX1200-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1200-NEXT: v_readfirstlane_b32 s5, v2
+; GFX1200-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1200-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1200-NEXT: s_wait_alu 0xf1ff
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX1200-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
+; GFX1200-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_and_saveexec_b32 s1, s1
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX1200-NEXT: s_cbranch_execnz .LBB8_1
+; GFX1200-NEXT: ; %bb.2:
+; GFX1200-NEXT: s_mov_b32 exec_lo, s2
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v5, v4
+; GFX1250-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
+; GFX1250-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_add_nc_u32 v9, 0x100, v6
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[4:5]
+; GFX1250-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s1, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v[8:9], s[4:7], s0 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX1250-NEXT: s_cbranch_execnz .LBB8_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret float %ret
@@ -595,41 +694,78 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v1
-; GFX12-NEXT: v_readfirstlane_b32 s5, v2
-; GFX12-NEXT: v_readfirstlane_b32 s6, v3
-; GFX12-NEXT: v_readfirstlane_b32 s7, v4
-; GFX12-NEXT: v_readfirstlane_b32 s3, v7
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
-; GFX12-NEXT: ; implicit-def: $vgpr7
-; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_mov_b32 s2, exec_lo
+; GFX1200-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1200-NEXT: v_readfirstlane_b32 s5, v2
+; GFX1200-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1200-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1200-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1200-NEXT: s_wait_alu 0xf1ff
+; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
+; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_and_b32 s0, s0, s1
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX1200-NEXT: ; implicit-def: $vgpr7
+; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1200-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1200-NEXT: ; %bb.2:
+; GFX1200-NEXT: s_mov_b32 exec_lo, s2
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v5, v4
+; GFX1250-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
+; GFX1250-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_add_nc_u32 v9, 0x100, v6
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1250-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_b32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v[8:9], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX1250-NEXT: ; implicit-def: $vgpr7
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret float %ret
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
index f001bf97fcd9e..0096289f39acc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
@@ -4,7 +4,8 @@
; Not supported in gfx8 or gfx9
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
@@ -35,16 +36,25 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
}
@@ -78,16 +88,25 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_add_nc_u32 v5, 0x100, v2
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v[4:5], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret float %ret
@@ -122,16 +141,24 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voff
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret float %ret
}
@@ -165,16 +192,25 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret float %ret
}
@@ -206,15 +242,23 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_
; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s16 idxen offen
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v[2:3], s[0:3], s16 idxen offen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -246,15 +290,23 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_
; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_add_nc_u32 v5, 0x100, v2
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v[4:5], s[0:3], s16 idxen offen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -288,15 +340,22 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_vof
; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s16 idxen
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s16 idxen
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s16 idxen
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s16 idxen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
}
@@ -328,15 +387,23 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_
; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s16 idxen offen slc
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret void
}
@@ -442,36 +509,68 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v1
-; GFX12-NEXT: v_readfirstlane_b32 s5, v2
-; GFX12-NEXT: v_readfirstlane_b32 s6, v3
-; GFX12-NEXT: v_readfirstlane_b32 s7, v4
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
-; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s1, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
-; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_mov_b32 s2, exec_lo
+; GFX1200-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1200-NEXT: v_readfirstlane_b32 s5, v2
+; GFX1200-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1200-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1200-NEXT: s_wait_alu 0xf1ff
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX1200-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
+; GFX1200-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_and_saveexec_b32 s1, s1
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX1200-NEXT: s_cbranch_execnz .LBB8_1
+; GFX1200-NEXT: ; %bb.2:
+; GFX1200-NEXT: s_mov_b32 exec_lo, s2
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v5, v4
+; GFX1250-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
+; GFX1250-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_add_nc_u32 v9, 0x100, v6
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[4:5]
+; GFX1250-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s1, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v[8:9], s[4:7], s0 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX1250-NEXT: s_cbranch_execnz .LBB8_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret float %ret
@@ -595,41 +694,78 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v1
-; GFX12-NEXT: v_readfirstlane_b32 s5, v2
-; GFX12-NEXT: v_readfirstlane_b32 s6, v3
-; GFX12-NEXT: v_readfirstlane_b32 s7, v4
-; GFX12-NEXT: v_readfirstlane_b32 s3, v7
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
-; GFX12-NEXT: ; implicit-def: $vgpr7
-; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_mov_b32 s2, exec_lo
+; GFX1200-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1200-NEXT: v_readfirstlane_b32 s5, v2
+; GFX1200-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1200-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1200-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1200-NEXT: s_wait_alu 0xf1ff
+; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
+; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_and_b32 s0, s0, s1
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX1200-NEXT: ; implicit-def: $vgpr7
+; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1200-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1200-NEXT: ; %bb.2:
+; GFX1200-NEXT: s_mov_b32 exec_lo, s2
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v5, v4
+; GFX1250-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
+; GFX1250-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_add_nc_u32 v9, 0x100, v6
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1250-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_b32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v[8:9], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX1250-NEXT: ; implicit-def: $vgpr7
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret float %ret
More information about the llvm-commits
mailing list