[llvm] [AMDGPU] Support waterfall loop for SOffset in VBUFFER (PR #95249)

Wed Jun 12 06:45:52 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Mariusz Sikora (mariusz-sikora-at-amd)

<details>
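<summary>Changes</summary>

`SIInstrInfo::legalizeOperands` now also passes the `soffset` operand of MUBUF/MTBUF instructions to `loadMBUFScalarOperandsFromVGPR` when its register class is not an SGPR class, in the same way it already does for `srsrc`. The raw/struct buffer and tbuffer store tests are extended accordingly (the raw buffer store test shown below adds cases with `rsrc` in VGPRs, `soffset` in a VGPR, and both).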



---

The patch is 57.24 KiB and has been truncated to 20.00 KiB below. The full version is available at: https://github.com/llvm/llvm-project/pull/95249.diff


5 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+7) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll (+219) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll (+313-2) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll (+209) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll (+392) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0edcdb337b5af..67f9c1ab3d23d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6718,6 +6718,13 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
     if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
       CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
 
+    if (isMUBUF(MI) || isMTBUF(MI)) {
+      MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
+      if (SOffset && !RI.isSGPRClass(MRI.getRegClass(SOffset->getReg()))) {
+        CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SOffset}, MDT);
+      }
+    }
+
     int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
     MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
     if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
index 75da1adc3123c..76c17524be7f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
@@ -575,6 +575,225 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32>
   ret void
 }
 
+define amdgpu_ps void @raw_buffer_store_waterfall_rsrc_vgpr(<4 x i32> %rsrc, i32 inreg %soffset, float %val) {
+; VERDE-LABEL: raw_buffer_store_waterfall_rsrc_vgpr:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    s_mov_b64 s[2:3], exec
+; VERDE-NEXT:  .LBB24_1: ; =>This Inner Loop Header: Depth=1
+; VERDE-NEXT:    v_readfirstlane_b32 s4, v0
+; VERDE-NEXT:    v_readfirstlane_b32 s5, v1
+; VERDE-NEXT:    v_readfirstlane_b32 s6, v2
+; VERDE-NEXT:    v_readfirstlane_b32 s7, v3
+; VERDE-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; VERDE-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[6:7], v[2:3]
+; VERDE-NEXT:    s_and_b64 s[2:3], vcc, s[2:3]
+; VERDE-NEXT:    s_and_saveexec_b64 s[2:3], s[2:3]
+; VERDE-NEXT:    buffer_store_dword v4, off, s[4:7], s0
+; VERDE-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; VERDE-NEXT:    ; implicit-def: $vgpr4
+; VERDE-NEXT:    s_xor_b64 exec, exec, s[2:3]
+; VERDE-NEXT:    s_cbranch_execnz .LBB24_1
+; VERDE-NEXT:  ; %bb.2:
+; VERDE-NEXT:    s_endpgm
+;
+; GFX8-LABEL: raw_buffer_store_waterfall_rsrc_vgpr:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:  .LBB24_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[6:7], v[2:3]
+; GFX8-NEXT:    s_and_b64 s[2:3], vcc, s[2:3]
+; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], s[2:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    buffer_store_dword v4, off, s[4:7], s0
+; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX8-NEXT:    ; implicit-def: $vgpr4
+; GFX8-NEXT:    s_xor_b64 exec, exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX8-NEXT:  ; %bb.2:
+; GFX8-NEXT:    s_endpgm
+;
+; GFX11-LABEL: raw_buffer_store_waterfall_rsrc_vgpr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-NEXT:  .LBB24_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s1, s[6:7], v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_and_saveexec_b32 s1, s1
+; GFX11-NEXT:    buffer_store_b32 v4, off, s[4:7], s0
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-NEXT:    ; implicit-def: $vgpr4
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX11-NEXT:  ; %bb.2:
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0);
+  ret void
+}
+
+define amdgpu_ps void @raw_buffer_store_waterfall_soffset_vgpr(<4 x i32> inreg %rsrc, i32 %soffset, float %val) {
+; VERDE-LABEL: raw_buffer_store_waterfall_soffset_vgpr:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    s_mov_b64 s[4:5], exec
+; VERDE-NEXT:  .LBB25_1: ; =>This Inner Loop Header: Depth=1
+; VERDE-NEXT:    v_readfirstlane_b32 s4, v0
+; VERDE-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
+; VERDE-NEXT:    s_and_saveexec_b64 vcc, vcc
+; VERDE-NEXT:    buffer_store_dword v1, off, s[0:3], s4
+; VERDE-NEXT:    ; implicit-def: $vgpr0
+; VERDE-NEXT:    ; implicit-def: $vgpr1
+; VERDE-NEXT:    s_xor_b64 exec, exec, vcc
+; VERDE-NEXT:    s_cbranch_execnz .LBB25_1
+; VERDE-NEXT:  ; %bb.2:
+; VERDE-NEXT:    s_endpgm
+;
+; GFX8-LABEL: raw_buffer_store_waterfall_soffset_vgpr:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_mov_b64 s[4:5], exec
+; GFX8-NEXT:  .LBB25_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX8-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GFX8-NEXT:    s_nop 2
+; GFX8-NEXT:    buffer_store_dword v1, off, s[0:3], s4
+; GFX8-NEXT:    ; implicit-def: $vgpr0
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:    s_xor_b64 exec, exec, vcc
+; GFX8-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX8-NEXT:  ; %bb.2:
+; GFX8-NEXT:    s_endpgm
+;
+; GFX11-LABEL: raw_buffer_store_waterfall_soffset_vgpr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s4, exec_lo
+; GFX11-NEXT:  .LBB25_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s4, v0
+; GFX11-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT:    buffer_store_b32 v1, off, s[0:3], s4
+; GFX11-NEXT:    ; implicit-def: $vgpr0
+; GFX11-NEXT:    ; implicit-def: $vgpr1
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX11-NEXT:  ; %bb.2:
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0);
+  ret void
+}
+
+define amdgpu_ps void @raw_buffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr(<4 x i32> %rsrc, i32 %soffset, float %val) {
+; VERDE-LABEL: raw_buffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    s_mov_b64 s[0:1], exec
+; VERDE-NEXT:  .LBB26_1: ; =>This Loop Header: Depth=1
+; VERDE-NEXT:    ; Child Loop BB26_2 Depth 2
+; VERDE-NEXT:    v_readfirstlane_b32 s4, v0
+; VERDE-NEXT:    v_readfirstlane_b32 s5, v1
+; VERDE-NEXT:    v_readfirstlane_b32 s6, v2
+; VERDE-NEXT:    v_readfirstlane_b32 s7, v3
+; VERDE-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; VERDE-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; VERDE-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; VERDE-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; VERDE-NEXT:    s_mov_b64 s[2:3], exec
+; VERDE-NEXT:  .LBB26_2: ; Parent Loop BB26_1 Depth=1
+; VERDE-NEXT:    ; => This Inner Loop Header: Depth=2
+; VERDE-NEXT:    v_readfirstlane_b32 s8, v4
+; VERDE-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v4
+; VERDE-NEXT:    s_and_saveexec_b64 vcc, vcc
+; VERDE-NEXT:    buffer_store_dword v5, off, s[4:7], s8
+; VERDE-NEXT:    s_xor_b64 exec, exec, vcc
+; VERDE-NEXT:    s_cbranch_execnz .LBB26_2
+; VERDE-NEXT:  ; %bb.3: ; in Loop: Header=BB26_1 Depth=1
+; VERDE-NEXT:    s_mov_b64 exec, s[2:3]
+; VERDE-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; VERDE-NEXT:    s_cbranch_execnz .LBB26_1
+; VERDE-NEXT:  ; %bb.4:
+; VERDE-NEXT:    s_endpgm
+;
+; GFX8-LABEL: raw_buffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_mov_b64 s[0:1], exec
+; GFX8-NEXT:  .LBB26_1: ; =>This Loop Header: Depth=1
+; GFX8-NEXT:    ; Child Loop BB26_2 Depth 2
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX8-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:  .LBB26_2: ; Parent Loop BB26_1 Depth=1
+; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX8-NEXT:    v_readfirstlane_b32 s8, v4
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v4
+; GFX8-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GFX8-NEXT:    s_nop 2
+; GFX8-NEXT:    buffer_store_dword v5, off, s[4:7], s8
+; GFX8-NEXT:    s_xor_b64 exec, exec, vcc
+; GFX8-NEXT:    s_cbranch_execnz .LBB26_2
+; GFX8-NEXT:  ; %bb.3: ; in Loop: Header=BB26_1 Depth=1
+; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX8-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX8-NEXT:  ; %bb.4:
+; GFX8-NEXT:    s_endpgm
+;
+; GFX11-LABEL: raw_buffer_store_waterfall_both_rsrc_vgpr_soffset_vgpr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:  .LBB26_1: ; =>This Loop Header: Depth=1
+; GFX11-NEXT:    ; Child Loop BB26_2 Depth 2
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-NEXT:  .LBB26_2: ; Parent Loop BB26_1 Depth=1
+; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s2, v4
+; GFX11-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT:    buffer_store_b32 v5, off, s[4:7], s2
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT:    s_cbranch_execnz .LBB26_2
+; GFX11-NEXT:  ; %bb.3: ; in Loop: Header=BB26_1 Depth=1
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX11-NEXT:  ; %bb.4:
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0);
+  ret void
+}
 declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
index 8641bf1b03f36..ef2ed2e276257 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
@@ -3,8 +3,8 @@
 ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=PREGFX10 %s
 ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
 ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefix=GFX11 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefix=GFX12 %s
-;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefix=GFX12 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
 
 define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
 ; PREGFX10-LABEL: tbuffer_store:
@@ -363,6 +363,317 @@ main_body:
   ret void
 }
 
+define amdgpu_ps void @raw_tbuffer_store_waterfall_soffset_vgpr(<4 x i32> inreg %rsrc, i32 %soffset, float %val) {
+; GFX10-LABEL: raw_tbuffer_store_waterfall_soffset_vgpr:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s4, exec_lo
+; GFX10-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s4, v0
+; GFX10-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX10-NEXT:    tbuffer_store_format_x v1, off, s[0:3], s4 format:[BUF_FMT_INVALID]
+; GFX10-NEXT:    ; implicit-def: $vgpr0
+; GFX10-NEXT:    ; implicit-def: $vgpr1
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX10-NEXT:  ; %bb.2:
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: raw_tbuffer_store_waterfall_soffset_vgpr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s4, exec_lo
+; GFX11-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s4, v0
+; GFX11-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT:    tbuffer_store_format_x v1, off, s[0:3], s4 format:[BUF_FMT_INVALID]
+; GFX11-NEXT:    ; implicit-def: $vgpr0
+; GFX11-NEXT:    ; implicit-def: $vgpr1
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX11-NEXT:  ; %bb.2:
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: raw_tbuffer_store_waterfall_soffset_vgpr:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_mov_b32 s4, exec_lo
+; GFX12-SDAG-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s4, v0
+; GFX12-SDAG-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX12-SDAG-NEXT:    tbuffer_store_format_x v1, off, s[0:3], s4 format:[BUF_FMT_INVALID]
+; GFX12-SDAG-NEXT:    ; implicit-def: $vgpr0
+; GFX12-SDAG-NEXT:    ; implicit-def: $vgpr1
+; GFX12-SDAG-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX12-SDAG-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX12-SDAG-NEXT:  ; %bb.2:
+; GFX12-SDAG-NEXT:    s_nop 0
+; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: raw_tbuffer_store_waterfall_soffset_vgpr:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX12-GISEL-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX12-GISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_cmpx_eq_u32_e64 s5, v0
+; GFX12-GISEL-NEXT:    tbuffer_store_format_x v1, off, s[0:3], s5 format:[BUF_FMT_INVALID]
+; GFX12-GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX12-GISEL-NEXT:    ; implicit-def: $vgpr1
+; GFX12-GISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX12-GISEL-NEXT:  ; %bb.2:
+; GFX12-GISEL-NEXT:    s_nop 0
+; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0, i32 0);
+  ret void
+}
+
+define amdgpu_ps void @raw_tbuffer_store_waterfall_rsrc_vgpr(<4 x i32> %rsrc, i32 inreg %soffset, float %val) {
+; GFX10-LABEL: raw_tbuffer_store_waterfall_rsrc_vgpr:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s1, exec_lo
+; GFX10-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX10-NEXT:    v_cmp_eq_u64_e64 s1, s[6:7], v[2:3]
+; GFX10-NEXT:    s_and_b32 s1, vcc_lo, s1
+; GFX10-NEXT:    s_and_saveexec_b32 s1, s1
+; GFX10-NEXT:    tbuffer_store_format_x v4, off, s[4:7], s0 format:[BUF_FMT_INVALID]
+; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX10-NEXT:    ; implicit-def: $vgpr4
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX10-NEXT:  ; %bb.2:
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: raw_tbuffer_store_waterfall_rsrc_vgpr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s1, s[6:7], v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_and_saveexec_b32 s1, s1
+; GFX11-NEXT:    tbuffer_store_format_x v4, off, s[4:7], s0 format:[BUF_FMT_INVALID]
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-NEXT:    ; implicit-def: $vgpr4
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX11-NEXT:  ; %bb.2:
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: raw_tbuffer_store_waterfall_rsrc_vgpr:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX12-SDAG-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-SDAG-NEXT:    v_cmp_eq_u64_e64 s1, s[6:7], v[2:3]
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_and_b32 s1, vcc_lo, s1
+; GFX12-SDAG-NEXT:    s_and_saveexec_b32 s1, s1
+; GFX12-SDAG-NEXT:    tbuffer_store_format_x v4, off, s[4:7], s0 format:[BUF_FMT_INVALID]
+; GFX12-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX12-SDAG-NEXT:    ; implicit-def: $vgpr4
+; GFX12-SDAG-NEXT:    s_xor_b32 exec_lo, exec_lo, s1
+; GFX12-SDAG-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX12-SDAG-NEXT:  ; %bb.2:
+; GFX12-SDAG-NEXT:    s_nop 0
+; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: raw_tbuffer_store_waterfall_rsrc_vgpr:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1
+; GFX12-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX12-GISEL-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[5:6]
+; GFX12-GISEL-NEXT:    v_cmp_eq_u64_e64 s1, s[6:7]...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/95249