[llvm] [AMDGPU] Support waterfall loop for SOffset in VBUFFER (PR #95249)
Mariusz Sikora via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 12 06:45:17 PDT 2024
https://github.com/mariusz-sikora-at-amd created https://github.com/llvm/llvm-project/pull/95249
(No description provided.)
From 5702ad498491abfa8df06f1e85968c174c872c11 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Wed, 12 Jun 2024 15:27:41 +0200
Subject: [PATCH] [AMDGPU] Support waterfall loop for SOffset in VBUFFER
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 7 +
.../AMDGPU/llvm.amdgcn.raw.buffer.store.ll | 219 ++++++++++
.../AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll | 315 +++++++++++++-
.../AMDGPU/llvm.amdgcn.struct.buffer.store.ll | 209 ++++++++++
.../llvm.amdgcn.struct.tbuffer.store.ll | 392 ++++++++++++++++++
5 files changed, 1140 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0edcdb337b5af..67f9c1ab3d23d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6718,6 +6718,13 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
+ if (isMUBUF(MI) || isMTBUF(MI)) {
+ MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
+ if (SOffset && !RI.isSGPRClass(MRI.getRegClass(SOffset->getReg()))) {
+ CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SOffset}, MDT);
+ }
+ }
+
int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
index 75da1adc3123c..76c17524be7f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
@@ -575,6 +575,225 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32>
ret void
}
+define amdgpu_ps void @raw_buffer_store_waterfall_rsrc_vgpr(<4 x i32> %rsrc, i32 inreg %soffset, float %val) {
+; VERDE-LABEL: raw_buffer_store_waterfall_rsrc_vgpr:
+; VERDE: ; %bb.0:
+; VERDE-NEXT: s_mov_b64 s[2:3], exec
+; VERDE-NEXT: .LBB24_1: ; =>This Inner Loop Header: Depth=1
+; VERDE-NEXT: v_readfirstlane_b32 s4, v0
+; VERDE-NEXT: v_readfirstlane_b32 s5, v1
+; VERDE-NEXT: v_readfirstlane_b32 s6, v2
+; VERDE-NEXT: v_readfirstlane_b32 s7, v3
+; VERDE-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; VERDE-NEXT: v_cmp_eq_u64_e64 s[2:3], s[6:7], v[2:3]
+; VERDE-NEXT: s_and_b64 s[2:3], vcc, s[2:3]
+; VERDE-NEXT: s_and_saveexec_b64 s[2:3], s[2:3]
+; VERDE-NEXT: buffer_store_dword v4, off, s[4:7], s0
+; VERDE-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; VERDE-NEXT: ; implicit-def: $vgpr4
+; VERDE-NEXT: s_xor_b64 exec, exec, s[2:3]
+; VERDE-NEXT: s_cbranch_execnz .LBB24_1
+; VERDE-NEXT: ; %bb.2:
+; VERDE-NEXT: s_endpgm
+;
+; GFX8-LABEL: raw_buffer_store_waterfall_rsrc_vgpr:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: .LBB24_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: v_readfirstlane_b32 s6, v2
+; GFX8-NEXT: v_readfirstlane_b32 s7, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[2:3], s[6:7], v[2:3]
+; GFX8-NEXT: s_and_b64 s[2:3], vcc, s[2:3]
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], s[2:3]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: buffer_store_dword v4, off, s[4:7], s0
+; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: s_xor_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_cbranch_execnz .LBB24_1
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: s_endpgm
+;
+; GFX11-LABEL: raw_buffer_store_waterfall_rsrc_vgpr:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: .LBB24_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_and_saveexec_b32 s1, s1
+; GFX11-NEXT: buffer_store_b32 v4, off, s[4:7], s0
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB24_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0);
+ ret void
+}
+
+define amdgpu_ps void @raw_buffer_store_waterfall_soffset_vgpr(<4 x i32> inreg %rsrc, i32 %soffset, float %val) {
+; VERDE-LABEL: raw_buffer_store_waterfall_soffset_vgpr:
+; VERDE: ; %bb.0:
+; VERDE-NEXT: s_mov_b64 s[4:5], exec
+; VERDE-NEXT: .LBB25_1: ; =>This Inner Loop Header: Depth=1
+; VERDE-NEXT: v_readfirstlane_b32 s4, v0
+; VERDE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
+; VERDE-NEXT: s_and_saveexec_b64 vcc, vcc
+; VERDE-NEXT: buffer_store_dword v1, off, s[0:3], s4
+; VERDE-NEXT: ; implicit-def: $vgpr0
+; VERDE-NEXT: ; implicit-def: $vgpr1
+; VERDE-NEXT: s_xor_b64 exec, exec, vcc
+; VERDE-NEXT: s_cbranch_execnz .LBB25_1
+; VERDE-NEXT: ; %bb.2:
+; VERDE-NEXT: s_endpgm
+;
+; GFX8-LABEL: raw_buffer_store_waterfall_soffset_vgpr:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: .LBB25_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX8-NEXT: s_and_saveexec_b64 vcc, vcc
+; GFX8-NEXT: s_nop 2
+; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s4
+; GFX8-NEXT: ; implicit-def: $vgpr0
+; GFX8-NEXT: ; implicit-def: $vgpr1
+; GFX8-NEXT: s_xor_b64 exec, exec, vcc
+; GFX8-NEXT: s_cbranch_execnz .LBB25_1
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: s_endpgm
+;
+; GFX11-LABEL: raw_buffer_store_waterfall_soffset_vgpr:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s4, exec_lo
+; GFX11-NEXT: .LBB25_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s4, v0
+; GFX11-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], s4
+; GFX11-NEXT: ; implicit-def: $vgpr0
+; GFX11-NEXT: ; implicit-def: $vgpr1
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT: s_cbranch_execnz .LBB25_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0);
+ ret void
+}
+
+define amdgpu_ps void @raw_buffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr(<4 x i32> %rsrc, i32 %soffset, float %val) {
+; VERDE-LABEL: raw_buffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr:
+; VERDE: ; %bb.0:
+; VERDE-NEXT: s_mov_b64 s[0:1], exec
+; VERDE-NEXT: .LBB26_1: ; =>This Loop Header: Depth=1
+; VERDE-NEXT: ; Child Loop BB26_2 Depth 2
+; VERDE-NEXT: v_readfirstlane_b32 s4, v0
+; VERDE-NEXT: v_readfirstlane_b32 s5, v1
+; VERDE-NEXT: v_readfirstlane_b32 s6, v2
+; VERDE-NEXT: v_readfirstlane_b32 s7, v3
+; VERDE-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; VERDE-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; VERDE-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; VERDE-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; VERDE-NEXT: s_mov_b64 s[2:3], exec
+; VERDE-NEXT: .LBB26_2: ; Parent Loop BB26_1 Depth=1
+; VERDE-NEXT: ; => This Inner Loop Header: Depth=2
+; VERDE-NEXT: v_readfirstlane_b32 s8, v4
+; VERDE-NEXT: v_cmp_eq_u32_e32 vcc, s8, v4
+; VERDE-NEXT: s_and_saveexec_b64 vcc, vcc
+; VERDE-NEXT: buffer_store_dword v5, off, s[4:7], s8
+; VERDE-NEXT: s_xor_b64 exec, exec, vcc
+; VERDE-NEXT: s_cbranch_execnz .LBB26_2
+; VERDE-NEXT: ; %bb.3: ; in Loop: Header=BB26_1 Depth=1
+; VERDE-NEXT: s_mov_b64 exec, s[2:3]
+; VERDE-NEXT: s_xor_b64 exec, exec, s[0:1]
+; VERDE-NEXT: s_cbranch_execnz .LBB26_1
+; VERDE-NEXT: ; %bb.4:
+; VERDE-NEXT: s_endpgm
+;
+; GFX8-LABEL: raw_buffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b64 s[0:1], exec
+; GFX8-NEXT: .LBB26_1: ; =>This Loop Header: Depth=1
+; GFX8-NEXT: ; Child Loop BB26_2 Depth 2
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: v_readfirstlane_b32 s6, v2
+; GFX8-NEXT: v_readfirstlane_b32 s7, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX8-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: .LBB26_2: ; Parent Loop BB26_1 Depth=1
+; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX8-NEXT: v_readfirstlane_b32 s8, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s8, v4
+; GFX8-NEXT: s_and_saveexec_b64 vcc, vcc
+; GFX8-NEXT: s_nop 2
+; GFX8-NEXT: buffer_store_dword v5, off, s[4:7], s8
+; GFX8-NEXT: s_xor_b64 exec, exec, vcc
+; GFX8-NEXT: s_cbranch_execnz .LBB26_2
+; GFX8-NEXT: ; %bb.3: ; in Loop: Header=BB26_1 Depth=1
+; GFX8-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB26_1
+; GFX8-NEXT: ; %bb.4:
+; GFX8-NEXT: s_endpgm
+;
+; GFX11-LABEL: raw_buffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: .LBB26_1: ; =>This Loop Header: Depth=1
+; GFX11-NEXT: ; Child Loop BB26_2 Depth 2
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: .LBB26_2: ; Parent Loop BB26_1 Depth=1
+; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-NEXT: v_readfirstlane_b32 s2, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s2, v4
+; GFX11-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT: buffer_store_b32 v5, off, s[4:7], s2
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT: s_cbranch_execnz .LBB26_2
+; GFX11-NEXT: ; %bb.3: ; in Loop: Header=BB26_1 Depth=1
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-NEXT: ; %bb.4:
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0);
+ ret void
+}
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
index 8641bf1b03f36..ef2ed2e276257 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
@@ -3,8 +3,8 @@
;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=PREGFX10 %s
;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefix=GFX11 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefix=GFX12 %s
-;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefix=GFX12 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
; PREGFX10-LABEL: tbuffer_store:
@@ -363,6 +363,317 @@ main_body:
ret void
}
+define amdgpu_ps void @raw_tbuffer_store_waterfall_soffset_vgpr(<4 x i32> inreg %rsrc, i32 %soffset, float %val) {
+; GFX10-LABEL: raw_tbuffer_store_waterfall_soffset_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s4, exec_lo
+; GFX10-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s4, v0
+; GFX10-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX10-NEXT: tbuffer_store_format_x v1, off, s[0:3], s4 format:[BUF_FMT_INVALID]
+; GFX10-NEXT: ; implicit-def: $vgpr0
+; GFX10-NEXT: ; implicit-def: $vgpr1
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX10-NEXT: s_cbranch_execnz .LBB11_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: raw_tbuffer_store_waterfall_soffset_vgpr:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s4, exec_lo
+; GFX11-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s4, v0
+; GFX11-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT: tbuffer_store_format_x v1, off, s[0:3], s4 format:[BUF_FMT_INVALID]
+; GFX11-NEXT: ; implicit-def: $vgpr0
+; GFX11-NEXT: ; implicit-def: $vgpr1
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT: s_cbranch_execnz .LBB11_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: raw_tbuffer_store_waterfall_soffset_vgpr:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_mov_b32 s4, exec_lo
+; GFX12-SDAG-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, s4, v0
+; GFX12-SDAG-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX12-SDAG-NEXT: tbuffer_store_format_x v1, off, s[0:3], s4 format:[BUF_FMT_INVALID]
+; GFX12-SDAG-NEXT: ; implicit-def: $vgpr0
+; GFX12-SDAG-NEXT: ; implicit-def: $vgpr1
+; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-SDAG-NEXT: ; %bb.2:
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: raw_tbuffer_store_waterfall_soffset_vgpr:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX12-GISEL-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s5, v0
+; GFX12-GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_cmpx_eq_u32_e64 s5, v0
+; GFX12-GISEL-NEXT: tbuffer_store_format_x v1, off, s[0:3], s5 format:[BUF_FMT_INVALID]
+; GFX12-GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX12-GISEL-NEXT: ; implicit-def: $vgpr1
+; GFX12-GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-GISEL-NEXT: ; %bb.2:
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0, i32 0);
+ ret void
+}
+
+define amdgpu_ps void @raw_tbuffer_store_waterfall_rsrc_vgpr(<4 x i32> %rsrc, i32 inreg %soffset, float %val) {
+; GFX10-LABEL: raw_tbuffer_store_waterfall_rsrc_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s1, exec_lo
+; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-NEXT: v_readfirstlane_b32 s6, v2
+; GFX10-NEXT: v_readfirstlane_b32 s7, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[2:3]
+; GFX10-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX10-NEXT: s_and_saveexec_b32 s1, s1
+; GFX10-NEXT: tbuffer_store_format_x v4, off, s[4:7], s0 format:[BUF_FMT_INVALID]
+; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: s_cbranch_execnz .LBB12_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: raw_tbuffer_store_waterfall_rsrc_vgpr:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_and_saveexec_b32 s1, s1
+; GFX11-NEXT: tbuffer_store_format_x v4, off, s[4:7], s0 format:[BUF_FMT_INVALID]
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB12_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: raw_tbuffer_store_waterfall_rsrc_vgpr:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-SDAG-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[2:3]
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX12-SDAG-NEXT: s_and_saveexec_b32 s1, s1
+; GFX12-SDAG-NEXT: tbuffer_store_format_x v4, off, s[4:7], s0 format:[BUF_FMT_INVALID]
+; GFX12-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4
+; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-SDAG-NEXT: ; %bb.2:
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: raw_tbuffer_store_waterfall_rsrc_vgpr:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1
+; GFX12-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v5
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s5, v6
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[5:6]
+; GFX12-GISEL-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[2:3]
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX12-GISEL-NEXT: s_and_saveexec_b32 s1, s1
+; GFX12-GISEL-NEXT: tbuffer_store_format_x v4, off, s[4:7], s0 format:[BUF_FMT_INVALID]
+; GFX12-GISEL-NEXT: ; implicit-def: $vgpr5
+; GFX12-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX12-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX12-GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-GISEL-NEXT: ; %bb.2:
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0, i32 0);
+ ret void
+}
+
+define amdgpu_ps void @raw_tbuffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr(<4 x i32> %rsrc, i32 %soffset, float %val) {
+; GFX10-LABEL: raw_tbuffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s0, exec_lo
+; GFX10-NEXT: .LBB13_1: ; =>This Loop Header: Depth=1
+; GFX10-NEXT: ; Child Loop BB13_2 Depth 2
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-NEXT: v_readfirstlane_b32 s6, v2
+; GFX10-NEXT: v_readfirstlane_b32 s7, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX10-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX10-NEXT: s_and_saveexec_b32 s0, s0
+; GFX10-NEXT: s_mov_b32 s1, exec_lo
+; GFX10-NEXT: .LBB13_2: ; Parent Loop BB13_1 Depth=1
+; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX10-NEXT: v_readfirstlane_b32 s2, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s2, v4
+; GFX10-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX10-NEXT: tbuffer_store_format_x v5, off, s[4:7], s2 format:[BUF_FMT_INVALID]
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX10-NEXT: s_cbranch_execnz .LBB13_2
+; GFX10-NEXT: ; %bb.3: ; in Loop: Header=BB13_1 Depth=1
+; GFX10-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT: s_cbranch_execnz .LBB13_1
+; GFX10-NEXT: ; %bb.4:
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: raw_tbuffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: .LBB13_1: ; =>This Loop Header: Depth=1
+; GFX11-NEXT: ; Child Loop BB13_2 Depth 2
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: .LBB13_2: ; Parent Loop BB13_1 Depth=1
+; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-NEXT: v_readfirstlane_b32 s2, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s2, v4
+; GFX11-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT: tbuffer_store_format_x v5, off, s[4:7], s2 format:[BUF_FMT_INVALID]
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT: s_cbranch_execnz .LBB13_2
+; GFX11-NEXT: ; %bb.3: ; in Loop: Header=BB13_1 Depth=1
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-NEXT: ; %bb.4:
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: raw_tbuffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX12-SDAG-NEXT: .LBB13_1: ; =>This Loop Header: Depth=1
+; GFX12-SDAG-NEXT: ; Child Loop BB13_2 Depth 2
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-SDAG-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-SDAG-NEXT: .LBB13_2: ; Parent Loop BB13_1 Depth=1
+; GFX12-SDAG-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v4
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, s2, v4
+; GFX12-SDAG-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX12-SDAG-NEXT: tbuffer_store_format_x v5, off, s[4:7], s2 format:[BUF_FMT_INVALID]
+; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB13_2
+; GFX12-SDAG-NEXT: ; %bb.3: ; in Loop: Header=BB13_1 Depth=1
+; GFX12-SDAG-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-SDAG-NEXT: ; %bb.4:
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: raw_tbuffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
+; GFX12-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX12-GISEL-NEXT: .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v6
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s5, v7
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s2, v4
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[6:7]
+; GFX12-GISEL-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s1, s2, v4
+; GFX12-GISEL-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_and_b32 s0, s0, s1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-GISEL-NEXT: tbuffer_store_format_x v5, off, s[4:7], s2 format:[BUF_FMT_INVALID]
+; GFX12-GISEL-NEXT: ; implicit-def: $vgpr6
+; GFX12-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX12-GISEL-NEXT: ; implicit-def: $vgpr5
+; GFX12-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX12-GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-GISEL-NEXT: ; %bb.2:
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0, i32 0);
+ ret void
+}
+
declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
index 94c9f7ab8e756..8a0572f52f031 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
@@ -385,6 +385,215 @@ define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16
ret void
}
+; Struct buffer store of <4 x i16> with %rsrc in SGPRs (inreg) and both the data
+; and %soffset in VGPRs. %soffset is the fourth operand of the intrinsic call.
+; The expected output is one straight-line store with a literal 0 in the scalar
+; offset slot and "idxen offen" addressing; there is no readfirstlane loop.
+; NOTE(review): the name promises SOffset waterfall coverage, but the checks show
+; none. %soffset appears to sit in the voffset operand (struct.buffer.store takes
+; data, rsrc, vindex, voffset, soffset, aux - confirm against the intrinsic
+; definition). If so, move it to the fifth operand and regenerate the checks.
+define amdgpu_ps void @struct_buffer_store_waterfall_soffset_vgprv4i16(<4 x i32> inreg %rsrc, <4 x i16> %v1, i32 %soffset) {
+; VERDE-LABEL: struct_buffer_store_waterfall_soffset_vgprv4i16:
+; VERDE: ; %bb.0:
+; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; VERDE-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; VERDE-NEXT: s_mov_b32 s4, 0
+; VERDE-NEXT: v_or_b32_e32 v2, v2, v3
+; VERDE-NEXT: v_or_b32_e32 v1, v0, v1
+; VERDE-NEXT: v_mov_b32_e32 v3, s4
+; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[0:3], 0 idxen offen
+; VERDE-NEXT: s_endpgm
+;
+; GFX8-LABEL: struct_buffer_store_waterfall_soffset_vgprv4i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b32 s4, 0
+; GFX8-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[0:3], 0 idxen offen
+; GFX8-NEXT: s_endpgm
+;
+; GFX11-LABEL: struct_buffer_store_waterfall_soffset_vgprv4i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-NEXT: buffer_store_b64 v[3:4], v[1:2], s[0:3], 0 idxen offen
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0, i32 0)
+ ret void
+}
+
+; Struct buffer store with %rsrc in VGPRs and %soffset in an SGPR (inreg).
+; The checks expect a single loop that reads the four %rsrc dwords into s[4:7]
+; with v_readfirstlane, compares them against the VGPR copies, applies
+; s_and_saveexec, issues the store, then branches back while exec is non-zero.
+; NOTE(review): %soffset is the fourth call operand; the checks copy s0 into
+; the VGPR address pair and keep a literal 0 in the scalar offset slot, so it
+; looks like it is consumed as voffset rather than soffset - confirm.
+define amdgpu_ps void @struct_buffer_store_waterfall_rsrc_vgpr(<4 x i32> %rsrc, <4 x i16> %v1, i32 inreg %soffset) {
+; VERDE-LABEL: struct_buffer_store_waterfall_rsrc_vgpr:
+; VERDE: ; %bb.0:
+; VERDE-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; VERDE-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; VERDE-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; VERDE-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; VERDE-NEXT: s_mov_b32 s1, 0
+; VERDE-NEXT: v_or_b32_e32 v6, v6, v7
+; VERDE-NEXT: v_or_b32_e32 v5, v4, v5
+; VERDE-NEXT: v_mov_b32_e32 v7, s1
+; VERDE-NEXT: v_mov_b32_e32 v8, s0
+; VERDE-NEXT: s_mov_b64 s[0:1], exec
+; VERDE-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; VERDE-NEXT: v_readfirstlane_b32 s4, v0
+; VERDE-NEXT: v_readfirstlane_b32 s5, v1
+; VERDE-NEXT: v_readfirstlane_b32 s6, v2
+; VERDE-NEXT: v_readfirstlane_b32 s7, v3
+; VERDE-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; VERDE-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; VERDE-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; VERDE-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; VERDE-NEXT: buffer_store_dwordx2 v[5:6], v[7:8], s[4:7], 0 idxen offen
+; VERDE-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; VERDE-NEXT: ; implicit-def: $vgpr5_vgpr6
+; VERDE-NEXT: ; implicit-def: $vgpr7_vgpr8
+; VERDE-NEXT: s_xor_b64 exec, exec, s[0:1]
+; VERDE-NEXT: s_cbranch_execnz .LBB18_1
+; VERDE-NEXT: ; %bb.2:
+; VERDE-NEXT: s_endpgm
+;
+; GFX8-LABEL: struct_buffer_store_waterfall_rsrc_vgpr:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b32 s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s1
+; GFX8-NEXT: v_mov_b32_e32 v7, s0
+; GFX8-NEXT: s_mov_b64 s[0:1], exec
+; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: v_readfirstlane_b32 s6, v2
+; GFX8-NEXT: v_readfirstlane_b32 s7, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX8-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: buffer_store_dwordx2 v[4:5], v[6:7], s[4:7], 0 idxen offen
+; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX8-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB18_1
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: s_endpgm
+;
+; GFX11-LABEL: struct_buffer_store_waterfall_rsrc_vgpr:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v7, s0 :: v_dual_mov_b32 v6, s1
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: buffer_store_b64 v[4:5], v[6:7], s[4:7], 0 idxen offen
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB18_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0, i32 0)
+ ret void
+}
+
+; Struct buffer store with %rsrc, the data and %soffset all in VGPRs.
+; The checks expect one loop (Depth=1) that reads %rsrc into s[4:7] with
+; v_readfirstlane; there is no second loop for %soffset.
+; NOTE(review): as the fourth call operand, %soffset ends up in the VGPR address
+; pair and the scalar offset slot stays a literal 0, so the "both" in the name
+; does not seem to be exercised - confirm the intended operand position.
+define amdgpu_ps void @struct_buffer_store_waterfall_both_rsrc_vgpr_soffset_vgprv(<4 x i32> %rsrc, <4 x i16> %v1, i32 %soffset) {
+; VERDE-LABEL: struct_buffer_store_waterfall_both_rsrc_vgpr_soffset_vgprv:
+; VERDE: ; %bb.0:
+; VERDE-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; VERDE-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; VERDE-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; VERDE-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; VERDE-NEXT: s_mov_b32 s0, 0
+; VERDE-NEXT: v_or_b32_e32 v6, v6, v7
+; VERDE-NEXT: v_or_b32_e32 v5, v4, v5
+; VERDE-NEXT: v_mov_b32_e32 v7, s0
+; VERDE-NEXT: s_mov_b64 s[0:1], exec
+; VERDE-NEXT: .LBB19_1: ; =>This Inner Loop Header: Depth=1
+; VERDE-NEXT: v_readfirstlane_b32 s4, v0
+; VERDE-NEXT: v_readfirstlane_b32 s5, v1
+; VERDE-NEXT: v_readfirstlane_b32 s6, v2
+; VERDE-NEXT: v_readfirstlane_b32 s7, v3
+; VERDE-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; VERDE-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; VERDE-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; VERDE-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; VERDE-NEXT: buffer_store_dwordx2 v[5:6], v[7:8], s[4:7], 0 idxen offen
+; VERDE-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; VERDE-NEXT: ; implicit-def: $vgpr5_vgpr6
+; VERDE-NEXT: ; implicit-def: $vgpr7_vgpr8
+; VERDE-NEXT: s_xor_b64 exec, exec, s[0:1]
+; VERDE-NEXT: s_cbranch_execnz .LBB19_1
+; VERDE-NEXT: ; %bb.2:
+; VERDE-NEXT: s_endpgm
+;
+; GFX8-LABEL: struct_buffer_store_waterfall_both_rsrc_vgpr_soffset_vgprv:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b32 s0, 0
+; GFX8-NEXT: v_mov_b32_e32 v7, v6
+; GFX8-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NEXT: s_mov_b64 s[0:1], exec
+; GFX8-NEXT: .LBB19_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: v_readfirstlane_b32 s6, v2
+; GFX8-NEXT: v_readfirstlane_b32 s7, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX8-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: buffer_store_dwordx2 v[4:5], v[6:7], s[4:7], 0 idxen offen
+; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX8-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB19_1
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: s_endpgm
+;
+; GFX11-LABEL: struct_buffer_store_waterfall_both_rsrc_vgpr_soffset_vgprv:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, s0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: .LBB19_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: buffer_store_b64 v[4:5], v[6:7], s[4:7], 0 idxen offen
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB19_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0, i32 0)
+ ret void
+}
+
declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
index 47b7658f50cc5..5d33c1452851a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
@@ -616,6 +616,398 @@ main_body:
ret void
}
+; Struct tbuffer store of <4 x float> with %rsrc in VGPRs and %soffset in an SGPR
+; (inreg). %soffset is the fifth call operand and the checks keep it in s0 as
+; the scalar offset of the store. A single loop reads the four %rsrc dwords into
+; s[4:7] with v_readfirstlane and branches back while exec is non-zero.
+; The format operand is i32 63; the expected "format:[...]" text differs between
+; the check prefixes below.
+define amdgpu_ps void @buffer_store_waterfall_rsrc_vgpr(<4 x i32> %rsrc, <4 x float> %data, i32 inreg %soffset) {
+; VERDE-LABEL: buffer_store_waterfall_rsrc_vgpr:
+; VERDE: ; %bb.0: ; %main_body
+; VERDE-NEXT: v_mov_b32_e32 v8, 0
+; VERDE-NEXT: s_mov_b64 s[2:3], exec
+; VERDE-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; VERDE-NEXT: v_readfirstlane_b32 s4, v0
+; VERDE-NEXT: v_readfirstlane_b32 s5, v1
+; VERDE-NEXT: v_readfirstlane_b32 s6, v2
+; VERDE-NEXT: v_readfirstlane_b32 s7, v3
+; VERDE-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; VERDE-NEXT: v_cmp_eq_u64_e64 s[2:3], s[6:7], v[2:3]
+; VERDE-NEXT: s_and_b64 s[2:3], vcc, s[2:3]
+; VERDE-NEXT: s_and_saveexec_b64 s[2:3], s[2:3]
+; VERDE-NEXT: tbuffer_store_format_xyzw v[4:7], v8, s[4:7], s0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen
+; VERDE-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; VERDE-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; VERDE-NEXT: ; implicit-def: $vgpr8
+; VERDE-NEXT: s_xor_b64 exec, exec, s[2:3]
+; VERDE-NEXT: s_cbranch_execnz .LBB14_1
+; VERDE-NEXT: ; %bb.2:
+; VERDE-NEXT: s_endpgm
+;
+; PREGFX10-LABEL: buffer_store_waterfall_rsrc_vgpr:
+; PREGFX10: ; %bb.0: ; %main_body
+; PREGFX10-NEXT: v_mov_b32_e32 v8, 0
+; PREGFX10-NEXT: s_mov_b64 s[2:3], exec
+; PREGFX10-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; PREGFX10-NEXT: v_readfirstlane_b32 s4, v0
+; PREGFX10-NEXT: v_readfirstlane_b32 s5, v1
+; PREGFX10-NEXT: v_readfirstlane_b32 s6, v2
+; PREGFX10-NEXT: v_readfirstlane_b32 s7, v3
+; PREGFX10-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; PREGFX10-NEXT: v_cmp_eq_u64_e64 s[2:3], s[6:7], v[2:3]
+; PREGFX10-NEXT: s_and_b64 s[2:3], vcc, s[2:3]
+; PREGFX10-NEXT: s_and_saveexec_b64 s[2:3], s[2:3]
+; PREGFX10-NEXT: s_nop 0
+; PREGFX10-NEXT: tbuffer_store_format_xyzw v[4:7], v8, s[4:7], s0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen
+; PREGFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; PREGFX10-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; PREGFX10-NEXT: ; implicit-def: $vgpr8
+; PREGFX10-NEXT: s_xor_b64 exec, exec, s[2:3]
+; PREGFX10-NEXT: s_cbranch_execnz .LBB14_1
+; PREGFX10-NEXT: ; %bb.2:
+; PREGFX10-NEXT: s_endpgm
+;
+; GFX10-LABEL: buffer_store_waterfall_rsrc_vgpr:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v8, 0
+; GFX10-NEXT: s_mov_b32 s1, exec_lo
+; GFX10-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-NEXT: v_readfirstlane_b32 s6, v2
+; GFX10-NEXT: v_readfirstlane_b32 s7, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[2:3]
+; GFX10-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX10-NEXT: s_and_saveexec_b32 s1, s1
+; GFX10-NEXT: tbuffer_store_format_xyzw v[4:7], v8, s[4:7], s0 format:[BUF_FMT_32_32_SINT] idxen
+; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; GFX10-NEXT: ; implicit-def: $vgpr8
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: s_cbranch_execnz .LBB14_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: buffer_store_waterfall_rsrc_vgpr:
+; GFX11: ; %bb.0: ; %main_body
+; GFX11-NEXT: v_mov_b32_e32 v8, 0
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_and_saveexec_b32 s1, s1
+; GFX11-NEXT: tbuffer_store_format_xyzw v[4:7], v8, s[4:7], s0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-NEXT: ; implicit-def: $vgpr8
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_waterfall_rsrc_vgpr:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_mov_b32_e32 v8, 0
+; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_and_saveexec_b32 s1, s1
+; GFX12-NEXT: tbuffer_store_format_xyzw v[4:7], v8, s[4:7], s0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen
+; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; GFX12-NEXT: ; implicit-def: $vgpr8
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 %soffset, i32 63, i32 0)
+ ret void
+}
+
+; Struct tbuffer store with %rsrc in SGPRs (inreg) and %soffset in a VGPR (v4).
+; The checks expect a single loop: v_readfirstlane copies %soffset into an SGPR,
+; v_cmp_eq_u32 compares that SGPR against v4, s_and_saveexec applies the result
+; to exec, and the store uses that SGPR as its scalar offset. s_xor on exec and
+; s_cbranch_execnz repeat the loop while exec is non-zero.
+define amdgpu_ps void @buffer_store_waterfall_soffset_vgpr(<4 x i32> inreg %rsrc, <4 x float> %data, i32 %soffset) {
+; VERDE-LABEL: buffer_store_waterfall_soffset_vgpr:
+; VERDE: ; %bb.0: ; %main_body
+; VERDE-NEXT: v_mov_b32_e32 v5, 0
+; VERDE-NEXT: s_mov_b64 s[4:5], exec
+; VERDE-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; VERDE-NEXT: v_readfirstlane_b32 s4, v4
+; VERDE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4
+; VERDE-NEXT: s_and_saveexec_b64 vcc, vcc
+; VERDE-NEXT: tbuffer_store_format_xyzw v[0:3], v5, s[0:3], s4 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen
+; VERDE-NEXT: ; implicit-def: $vgpr4
+; VERDE-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; VERDE-NEXT: ; implicit-def: $vgpr5
+; VERDE-NEXT: s_xor_b64 exec, exec, vcc
+; VERDE-NEXT: s_cbranch_execnz .LBB15_1
+; VERDE-NEXT: ; %bb.2:
+; VERDE-NEXT: s_endpgm
+;
+; PREGFX10-LABEL: buffer_store_waterfall_soffset_vgpr:
+; PREGFX10: ; %bb.0: ; %main_body
+; PREGFX10-NEXT: v_mov_b32_e32 v5, 0
+; PREGFX10-NEXT: s_mov_b64 s[4:5], exec
+; PREGFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; PREGFX10-NEXT: v_readfirstlane_b32 s4, v4
+; PREGFX10-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4
+; PREGFX10-NEXT: s_and_saveexec_b64 vcc, vcc
+; PREGFX10-NEXT: s_nop 2
+; PREGFX10-NEXT: tbuffer_store_format_xyzw v[0:3], v5, s[0:3], s4 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen
+; PREGFX10-NEXT: ; implicit-def: $vgpr4
+; PREGFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; PREGFX10-NEXT: ; implicit-def: $vgpr5
+; PREGFX10-NEXT: s_xor_b64 exec, exec, vcc
+; PREGFX10-NEXT: s_cbranch_execnz .LBB15_1
+; PREGFX10-NEXT: ; %bb.2:
+; PREGFX10-NEXT: s_endpgm
+;
+; GFX10-LABEL: buffer_store_waterfall_soffset_vgpr:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
+; GFX10-NEXT: s_mov_b32 s4, exec_lo
+; GFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s4, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s4, v4
+; GFX10-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX10-NEXT: tbuffer_store_format_xyzw v[0:3], v5, s[0:3], s4 format:[BUF_FMT_32_32_SINT] idxen
+; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX10-NEXT: ; implicit-def: $vgpr5
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX10-NEXT: s_cbranch_execnz .LBB15_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: buffer_store_waterfall_soffset_vgpr:
+; GFX11: ; %bb.0: ; %main_body
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_mov_b32 s4, exec_lo
+; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s4, v4
+; GFX11-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v5, s[0:3], s4 format:[BUF_FMT_32_32_32_32_FLOAT] idxen
+; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr5
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_waterfall_soffset_vgpr:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_mov_b32_e32 v5, 0
+; GFX12-NEXT: s_mov_b32 s4, exec_lo
+; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, s4, v4
+; GFX12-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], v5, s[0:3], s4 format:[BUF_FMT_32_32_32_32_FLOAT] idxen
+; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX12-NEXT: ; implicit-def: $vgpr5
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX12-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 %soffset, i32 63, i32 0)
+ ret void
+}
+
+; Struct tbuffer store with %rsrc and %soffset both in VGPRs.
+; The checks expect two nested loops: the outer one (Depth=1) reads %rsrc into
+; s[4:7] with v_readfirstlane, the inner one (Depth=2) reads %soffset (v8) into
+; an SGPR that the store uses as its scalar offset. After the inner loop, exec
+; is reloaded from the copy taken just before it, and the outer loop's s_xor on
+; exec plus s_cbranch_execnz decide whether another outer iteration runs.
+define amdgpu_ps void @buffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr(<4 x i32> %rsrc, <4 x float> %data, i32 %soffset) {
+; VERDE-LABEL: buffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr:
+; VERDE: ; %bb.0: ; %main_body
+; VERDE-NEXT: v_mov_b32_e32 v9, 0
+; VERDE-NEXT: s_mov_b64 s[0:1], exec
+; VERDE-NEXT: .LBB16_1: ; =>This Loop Header: Depth=1
+; VERDE-NEXT: ; Child Loop BB16_2 Depth 2
+; VERDE-NEXT: v_readfirstlane_b32 s4, v0
+; VERDE-NEXT: v_readfirstlane_b32 s5, v1
+; VERDE-NEXT: v_readfirstlane_b32 s6, v2
+; VERDE-NEXT: v_readfirstlane_b32 s7, v3
+; VERDE-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; VERDE-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; VERDE-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; VERDE-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; VERDE-NEXT: s_mov_b64 s[2:3], exec
+; VERDE-NEXT: .LBB16_2: ; Parent Loop BB16_1 Depth=1
+; VERDE-NEXT: ; => This Inner Loop Header: Depth=2
+; VERDE-NEXT: v_readfirstlane_b32 s8, v8
+; VERDE-NEXT: v_cmp_eq_u32_e32 vcc, s8, v8
+; VERDE-NEXT: s_and_saveexec_b64 vcc, vcc
+; VERDE-NEXT: tbuffer_store_format_xyzw v[4:7], v9, s[4:7], s8 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen
+; VERDE-NEXT: s_xor_b64 exec, exec, vcc
+; VERDE-NEXT: s_cbranch_execnz .LBB16_2
+; VERDE-NEXT: ; %bb.3: ; in Loop: Header=BB16_1 Depth=1
+; VERDE-NEXT: s_mov_b64 exec, s[2:3]
+; VERDE-NEXT: s_xor_b64 exec, exec, s[0:1]
+; VERDE-NEXT: s_cbranch_execnz .LBB16_1
+; VERDE-NEXT: ; %bb.4:
+; VERDE-NEXT: s_endpgm
+;
+; PREGFX10-LABEL: buffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr:
+; PREGFX10: ; %bb.0: ; %main_body
+; PREGFX10-NEXT: v_mov_b32_e32 v9, 0
+; PREGFX10-NEXT: s_mov_b64 s[0:1], exec
+; PREGFX10-NEXT: .LBB16_1: ; =>This Loop Header: Depth=1
+; PREGFX10-NEXT: ; Child Loop BB16_2 Depth 2
+; PREGFX10-NEXT: v_readfirstlane_b32 s4, v0
+; PREGFX10-NEXT: v_readfirstlane_b32 s5, v1
+; PREGFX10-NEXT: v_readfirstlane_b32 s6, v2
+; PREGFX10-NEXT: v_readfirstlane_b32 s7, v3
+; PREGFX10-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; PREGFX10-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; PREGFX10-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; PREGFX10-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; PREGFX10-NEXT: s_mov_b64 s[2:3], exec
+; PREGFX10-NEXT: .LBB16_2: ; Parent Loop BB16_1 Depth=1
+; PREGFX10-NEXT: ; => This Inner Loop Header: Depth=2
+; PREGFX10-NEXT: v_readfirstlane_b32 s8, v8
+; PREGFX10-NEXT: v_cmp_eq_u32_e32 vcc, s8, v8
+; PREGFX10-NEXT: s_and_saveexec_b64 vcc, vcc
+; PREGFX10-NEXT: s_nop 2
+; PREGFX10-NEXT: tbuffer_store_format_xyzw v[4:7], v9, s[4:7], s8 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen
+; PREGFX10-NEXT: s_xor_b64 exec, exec, vcc
+; PREGFX10-NEXT: s_cbranch_execnz .LBB16_2
+; PREGFX10-NEXT: ; %bb.3: ; in Loop: Header=BB16_1 Depth=1
+; PREGFX10-NEXT: s_mov_b64 exec, s[2:3]
+; PREGFX10-NEXT: s_xor_b64 exec, exec, s[0:1]
+; PREGFX10-NEXT: s_cbranch_execnz .LBB16_1
+; PREGFX10-NEXT: ; %bb.4:
+; PREGFX10-NEXT: s_endpgm
+;
+; GFX10-LABEL: buffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v9, 0
+; GFX10-NEXT: s_mov_b32 s0, exec_lo
+; GFX10-NEXT: .LBB16_1: ; =>This Loop Header: Depth=1
+; GFX10-NEXT: ; Child Loop BB16_2 Depth 2
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-NEXT: v_readfirstlane_b32 s6, v2
+; GFX10-NEXT: v_readfirstlane_b32 s7, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX10-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX10-NEXT: s_and_saveexec_b32 s0, s0
+; GFX10-NEXT: s_mov_b32 s1, exec_lo
+; GFX10-NEXT: .LBB16_2: ; Parent Loop BB16_1 Depth=1
+; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX10-NEXT: v_readfirstlane_b32 s2, v8
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s2, v8
+; GFX10-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX10-NEXT: tbuffer_store_format_xyzw v[4:7], v9, s[4:7], s2 format:[BUF_FMT_32_32_SINT] idxen
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX10-NEXT: s_cbranch_execnz .LBB16_2
+; GFX10-NEXT: ; %bb.3: ; in Loop: Header=BB16_1 Depth=1
+; GFX10-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT: s_cbranch_execnz .LBB16_1
+; GFX10-NEXT: ; %bb.4:
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: buffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr:
+; GFX11: ; %bb.0: ; %main_body
+; GFX11-NEXT: v_mov_b32_e32 v9, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: .LBB16_1: ; =>This Loop Header: Depth=1
+; GFX11-NEXT: ; Child Loop BB16_2 Depth 2
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: .LBB16_2: ; Parent Loop BB16_1 Depth=1
+; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-NEXT: v_readfirstlane_b32 s2, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s2, v8
+; GFX11-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT: tbuffer_store_format_xyzw v[4:7], v9, s[4:7], s2 format:[BUF_FMT_32_32_32_32_FLOAT] idxen
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT: s_cbranch_execnz .LBB16_2
+; GFX11-NEXT: ; %bb.3: ; in Loop: Header=BB16_1 Depth=1
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB16_1
+; GFX11-NEXT: ; %bb.4:
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_mov_b32_e32 v9, 0
+; GFX12-NEXT: s_mov_b32 s0, exec_lo
+; GFX12-NEXT: .LBB16_1: ; =>This Loop Header: Depth=1
+; GFX12-NEXT: ; Child Loop BB16_2 Depth 2
+; GFX12-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: .LBB16_2: ; Parent Loop BB16_1 Depth=1
+; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-NEXT: v_readfirstlane_b32 s2, v8
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, s2, v8
+; GFX12-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX12-NEXT: tbuffer_store_format_xyzw v[4:7], v9, s[4:7], s2 format:[BUF_FMT_32_32_32_32_FLOAT] idxen
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX12-NEXT: s_cbranch_execnz .LBB16_2
+; GFX12-NEXT: ; %bb.3: ; in Loop: Header=BB16_1 Depth=1
+; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB16_1
+; GFX12-NEXT: ; %bb.4:
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 %soffset, i32 63, i32 0)
+ ret void
+}
+
declare void @llvm.amdgcn.struct.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0
More information about the llvm-commits
mailing list