[llvm] [AMDGPU] Extend zero initialization of return values for TFE (PR #85759)

David Stuttard via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 19 02:46:49 PDT 2024


https://github.com/dstutt created https://github.com/llvm/llvm-project/pull/85759

buffer_load instructions that use TFE also need to zero initialize return values similar to how the image instructions currently work. Add support for this with standard zero init of all results + zero init of just TFE flag when enable-prt-strict-null subtarget feature is disabled.

>From fed91a02de5f3a0654bb8c00356dad766adeec01 Mon Sep 17 00:00:00 2001
From: David Stuttard <david.stuttard at amd.com>
Date: Mon, 18 Mar 2024 14:54:22 +0000
Subject: [PATCH] [AMDGPU] Extend zero initialization of return values for TFE

buffer_load instructions that use TFE also need to zero initialize return values
similar to how the image instructions currently work.
Add support for this with standard zero init of all results + zero init of just TFE flag when
enable-prt-strict-null subtarget feature is disabled.
---
 llvm/lib/Target/AMDGPU/BUFInstructions.td     |   8 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  81 +--
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |   2 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |   6 +
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |   3 +
 .../llvm.amdgcn.struct.buffer.load.format.ll  | 513 +++++++++++++++---
 ...vm.amdgcn.struct.ptr.buffer.load.format.ll | 280 ++++++++++
 7 files changed, 785 insertions(+), 108 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 4ae514ffcf7850..3078e56b54b834 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -119,6 +119,7 @@ class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> :
 
   let isPseudo = 0;
   let isCodeGenOnly = 0;
+  let hasPostISelHook = 1;
 
   let VM_CNT = 1;
   let EXP_CNT = 1;
@@ -323,6 +324,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
   Instruction BaseOpcode = !cast<Instruction>(MUBUFGetBaseOpcode<NAME>.ret);
   let MUBUF = 1;
   let AsmMatchConverter = "cvtMubuf";
+  let hasPostISelHook = 1;
 }
 
 class MUBUF_Real <MUBUF_Pseudo ps, string real_name = ps.Mnemonic> :
@@ -330,6 +332,7 @@ class MUBUF_Real <MUBUF_Pseudo ps, string real_name = ps.Mnemonic> :
 
   let isPseudo = 0;
   let isCodeGenOnly = 0;
+  let hasPostISelHook = 1;
 
   let VM_CNT = 1;
   let EXP_CNT = 1;
@@ -390,6 +393,7 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> :
   let has_slc     = 0;
   let has_sccb    = 0;
   let sccb_value  = 0;
+  let tfe         = 0;
 }
 
 class getLdStVDataRegisterOperand<RegisterClass RC, bit isTFE> {
@@ -675,6 +679,7 @@ class MUBUF_Pseudo_Store_Lds<string opName>
   let has_vaddr = 0;
   let lds = 1;
   let VALU = 1;
+  let tfe = 0;
 
   let Uses = [EXEC, M0];
   let AsmMatchConverter = "cvtMubuf";
@@ -733,6 +738,7 @@ class MUBUF_Atomic_Pseudo<string opName,
   let has_glc = 0;
   let has_dlc = 0;
   let has_sccb = 1;
+  let tfe = 0;
   let AsmMatchConverter = "cvtMubufAtomic";
 }
 
@@ -3369,7 +3375,7 @@ def MUBUFInfoTable : GenericTable {
   let CppTypeName = "MUBUFInfo";
   let Fields = [
     "Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset",
-    "IsBufferInv"
+    "IsBufferInv", "tfe"
   ];
 
   let PrimaryKey = ["Opcode"];
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5ccf21f76015de..c1cfa39368f479 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15033,57 +15033,63 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
 // result register that will be written in the case of a memory access failure.
 // The required code is also added to tie this init code to the result of the
 // img instruction.
-void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
+void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
   MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
   MachineBasicBlock &MBB = *MI.getParent();
 
-  MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
-  MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
-  MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
+  int DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
+  unsigned InitIdx = 0;
 
-  if (!TFE && !LWE) // intersect_ray
-    return;
-
-  unsigned TFEVal = TFE ? TFE->getImm() : 0;
-  unsigned LWEVal = LWE ? LWE->getImm() : 0;
-  unsigned D16Val = D16 ? D16->getImm() : 0;
+  if (TII->isImage(MI)) {
+    MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
+    MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
+    MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
 
-  if (!TFEVal && !LWEVal)
-    return;
+    if (!TFE && !LWE) // intersect_ray
+      return;
 
-  // At least one of TFE or LWE are non-zero
-  // We have to insert a suitable initialization of the result value and
-  // tie this to the dest of the image instruction.
+    unsigned TFEVal = TFE ? TFE->getImm() : 0;
+    unsigned LWEVal = LWE ? LWE->getImm() : 0;
+    unsigned D16Val = D16 ? D16->getImm() : 0;
 
-  const DebugLoc &DL = MI.getDebugLoc();
+    if (!TFEVal && !LWEVal)
+      return;
 
-  int DstIdx =
-      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
+    // At least one of TFE or LWE are non-zero
+    // We have to insert a suitable initialization of the result value and
+    // tie this to the dest of the image instruction.
 
-  // Calculate which dword we have to initialize to 0.
-  MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
+    // Calculate which dword we have to initialize to 0.
+    MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
 
-  // check that dmask operand is found.
-  assert(MO_Dmask && "Expected dmask operand in instruction");
+    // check that dmask operand is found.
+    assert(MO_Dmask && "Expected dmask operand in instruction");
 
-  unsigned dmask = MO_Dmask->getImm();
-  // Determine the number of active lanes taking into account the
-  // Gather4 special case
-  unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
+    unsigned dmask = MO_Dmask->getImm();
+    // Determine the number of active lanes taking into account the
+    // Gather4 special case
+    unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
 
-  bool Packed = !Subtarget->hasUnpackedD16VMem();
+    bool Packed = !Subtarget->hasUnpackedD16VMem();
 
-  unsigned InitIdx =
-      D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
+    InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
 
-  // Abandon attempt if the dst size isn't large enough
-  // - this is in fact an error but this is picked up elsewhere and
-  // reported correctly.
-  uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
-  if (DstSize < InitIdx)
+    // Abandon attempt if the dst size isn't large enough
+    // - this is in fact an error but this is picked up elsewhere and
+    // reported correctly.
+    uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
+    if (DstSize < InitIdx)
+      return;
+  } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFHasTFE(MI.getOpcode())) {
+    uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
+    InitIdx = DstSize;
+  } else {
     return;
+  }
+
+  const DebugLoc &DL = MI.getDebugLoc();
 
   // Create a register for the initialization value.
   Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
@@ -15184,11 +15190,12 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
     return;
   }
 
-  if (TII->isImage(MI)) {
+  if (TII->isImage(MI) || TII->isMUBUF(MI)) {
     if (!MI.mayStore())
-      AddIMGInit(MI);
-    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
+      AddMemOpInit(MI);
   }
+  if (TII->isImage(MI))
+    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
 }
 
 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 89da4428e3ab0a..9856a2923d38f7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -466,7 +466,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
   SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
-  void AddIMGInit(MachineInstr &MI) const;
+  void AddMemOpInit(MachineInstr &MI) const;
   void AdjustInstrPostInstrSelection(MachineInstr &MI,
                                      SDNode *Node) const override;
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 6d53f68ace70df..5e89b3fdaf032d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -318,6 +318,7 @@ struct MUBUFInfo {
   bool has_srsrc;
   bool has_soffset;
   bool IsBufferInv;
+  bool tfe;
 };
 
 struct MTBUFInfo {
@@ -466,6 +467,11 @@ bool getMUBUFIsBufferInv(unsigned Opc) {
   return Info ? Info->IsBufferInv : false;
 }
 
+bool getMUBUFHasTFE(unsigned Opc) {
+  const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+  return Info ? Info->tfe : false;
+}
+
 bool getSMEMIsBuffer(unsigned Opc) {
   const SMInfo *Info = getSMEMOpcodeHelper(Opc);
   return Info ? Info->IsBuffer : false;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 29ac402d953513..a8a4736b8538c1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -525,6 +525,9 @@ bool getMUBUFHasSoffset(unsigned Opc);
 LLVM_READONLY
 bool getMUBUFIsBufferInv(unsigned Opc);
 
+LLVM_READONLY
+bool getMUBUFHasTFE(unsigned Opc);
+
 LLVM_READONLY
 bool getSMEMIsBuffer(unsigned Opc);
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
index 00be32b06de058..286545d479d2ec 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
@@ -2,6 +2,7 @@
 ;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GFX6 %s
 ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GFX8PLUS %s
 ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GFX11 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-enable-prt-strict-null -verify-machineinstrs | FileCheck --check-prefixes=NOPRT %s
 ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s
 ;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s
 
@@ -34,6 +35,16 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v8, 0
+; NOPRT-NEXT:    s_clause 0x2
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v8, s[0:3], 0 idxen
+; NOPRT-NEXT:    buffer_load_format_xyzw v[4:7], v8, s[0:3], 0 idxen glc
+; NOPRT-NEXT:    buffer_load_format_xyzw v[8:11], v8, s[0:3], 0 idxen slc
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v8, 0
@@ -75,6 +86,13 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_immoffs:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v0, 0
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:42
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_immoffs:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -146,6 +164,25 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
 ; GFX11-NEXT:    v_add_f32_e32 v2, v10, v2
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_immoffs_large:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v8, 0
+; NOPRT-NEXT:    s_movk_i32 s4, 0x7ffc
+; NOPRT-NEXT:    s_clause 0x1
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v8, s[0:3], 60 idxen offset:4092
+; NOPRT-NEXT:    buffer_load_format_xyzw v[4:7], v8, s[0:3], s4 idxen offset:4092
+; NOPRT-NEXT:    s_mov_b32 s4, 0x8ffc
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    v_add_f32_e32 v1, v1, v5
+; NOPRT-NEXT:    buffer_load_format_xyzw v[8:11], v8, s[0:3], s4 idxen offset:4
+; NOPRT-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v3, v3, v7
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v1, v9, v1
+; NOPRT-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; NOPRT-NEXT:    v_dual_add_f32 v0, v8, v0 :: v_dual_add_f32 v3, v11, v3
+; NOPRT-NEXT:    v_add_f32_e32 v2, v10, v2
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_immoffs_large:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v8, 0
@@ -196,6 +233,13 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_voffset_large_12bit:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v0, 0
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:4092
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_voffset_large_12bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -235,6 +279,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(<4 x i32> inreg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_voffset_large_13bit:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_voffset_large_13bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -274,6 +327,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(<4 x i32> inreg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_voffset_large_16bit:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, 0xf000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_voffset_large_16bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -313,6 +375,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(<4 x i32> inreg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_voffset_large_23bit:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, 0x7ff000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_voffset_large_23bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -352,6 +423,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(<4 x i32> inreg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_voffset_large_24bit:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, 0xfff000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-SDAG-LABEL: buffer_load_voffset_large_24bit:
 ; GFX12-SDAG:       ; %bb.0: ; %main_body
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 0x800000 :: v_dual_mov_b32 v0, 0
@@ -389,6 +469,12 @@ define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_idx:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_idx:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen
@@ -427,6 +513,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_ofs:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_ofs:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
@@ -466,6 +561,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_ofs_imm:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_ofs_imm:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
@@ -497,6 +601,12 @@ define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_both:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_both:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null idxen offen
@@ -529,6 +639,13 @@ define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i3
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_both_reversed:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v2, v0
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_both_reversed:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v0
@@ -562,6 +679,13 @@ define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_x:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v0, 0
+; NOPRT-NEXT:    buffer_load_format_x v0, v0, s[0:3], 0 idxen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_x:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -595,6 +719,13 @@ define amdgpu_ps float @buffer_load_x_i32(<4 x i32> inreg %rsrc) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_x_i32:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v0, 0
+; NOPRT-NEXT:    buffer_load_format_x v0, v0, s[0:3], 0 idxen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_x_i32:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -629,6 +760,13 @@ define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_xy:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v0, 0
+; NOPRT-NEXT:    buffer_load_format_xy v[0:1], v0, s[0:3], 0 idxen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_xy:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -644,7 +782,12 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX6-LABEL: buffer_load_v4i32_tfe:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
-; GFX6-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
+; GFX6-NEXT:    v_mov_b32_e32 v7, 2
+; GFX6-NEXT:    v_mov_b32_e32 v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
+; GFX6-NEXT:    v_mov_b32_e32 v6, v2
+; GFX6-NEXT:    buffer_load_format_xyzw v[2:6], v7, s[0:3], 0 idxen tfe
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s0, s2
@@ -658,7 +801,12 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX8PLUS-LABEL: buffer_load_v4i32_tfe:
 ; GFX8PLUS:       ; %bb.0:
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8PLUS-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v7, 2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8PLUS-NEXT:    buffer_load_format_xyzw v[2:6], v7, s[0:3], 0 idxen tfe
 ; GFX8PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8PLUS-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v0, v6
@@ -667,22 +815,49 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ;
 ; GFX11-LABEL: buffer_load_v4i32_tfe:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v7, 2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    v_mov_b32_e32 v4, v2
+; GFX11-NEXT:    v_mov_b32_e32 v5, v2
+; GFX11-NEXT:    v_mov_b32_e32 v6, v2
+; GFX11-NEXT:    buffer_load_format_xyzw v[2:6], v7, s[0:3], 0 idxen tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: buffer_load_v4i32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b128 v[0:1], v[2:5], off
-; GFX12-NEXT:    v_mov_b32_e32 v0, v6
-; GFX12-NEXT:    ; return to shader part epilog
-  %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
+; NOPRT-LABEL: buffer_load_v4i32_tfe:
+; NOPRT:       ; %bb.0:
+; NOPRT-NEXT:    v_mov_b32_e32 v2, 2
+; NOPRT-NEXT:    v_mov_b32_e32 v6, 0
+; NOPRT-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v6
+; NOPRT-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: buffer_load_v4i32_tfe:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v7, 2
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v2
+; GFX12-SDAG-NEXT:    buffer_load_format_xyzw v[2:6], v7, s[0:3], null idxen tfe
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v6
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: buffer_load_v4i32_tfe:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 2
+; GFX12-GISEL-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, v6
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
+  %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 2, i32 0, i32 0, i32 0)
   %data = extractvalue { <4 x i32>, i32 } %load, 0
   store <4 x i32> %data, ptr addrspace(1) %out
   %status = extractvalue { <4 x i32>, i32 } %load, 1
@@ -694,6 +869,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX6-LABEL: buffer_load_v4f32_tfe:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
+; GFX6-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX6-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -708,6 +887,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX8PLUS-LABEL: buffer_load_v4f32_tfe:
 ; GFX8PLUS:       ; %bb.0:
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX8PLUS-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
 ; GFX8PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8PLUS-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
@@ -718,20 +901,46 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX11-LABEL: buffer_load_v4f32_tfe:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    v_mov_b32_e32 v4, v2
+; GFX11-NEXT:    v_mov_b32_e32 v5, v2
+; GFX11-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX11-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: buffer_load_v4f32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b128 v[0:1], v[2:5], off
-; GFX12-NEXT:    v_mov_b32_e32 v0, v6
-; GFX12-NEXT:    ; return to shader part epilog
+; NOPRT-LABEL: buffer_load_v4f32_tfe:
+; NOPRT:       ; %bb.0:
+; NOPRT-NEXT:    v_mov_b32_e32 v6, 0
+; NOPRT-NEXT:    buffer_load_format_xyzw v[2:6], v6, s[0:3], 0 idxen tfe
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v6
+; NOPRT-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: buffer_load_v4f32_tfe:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v2
+; GFX12-SDAG-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v6
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: buffer_load_v4f32_tfe:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, v6
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %load = call { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <4 x float>, i32 } %load, 0
   store <4 x float> %data, ptr addrspace(1) %out
@@ -744,6 +953,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX6-LABEL: buffer_load_v3i32_tfe:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX6-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -759,6 +971,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX8PLUS-LABEL: buffer_load_v3i32_tfe:
 ; GFX8PLUS:       ; %bb.0:
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX8PLUS-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
 ; GFX8PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8PLUS-NEXT:    flat_store_dwordx3 v[0:1], v[2:4]
@@ -769,20 +984,45 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX11-LABEL: buffer_load_v3i32_tfe:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    v_mov_b32_e32 v4, v2
+; GFX11-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX11-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[2:4], off
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: buffer_load_v3i32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b96 v[0:1], v[2:4], off
-; GFX12-NEXT:    v_mov_b32_e32 v0, v5
-; GFX12-NEXT:    ; return to shader part epilog
+; NOPRT-LABEL: buffer_load_v3i32_tfe:
+; NOPRT:       ; %bb.0:
+; NOPRT-NEXT:    v_mov_b32_e32 v5, 0
+; NOPRT-NEXT:    buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    global_store_b96 v[0:1], v[2:4], off
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v5
+; NOPRT-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: buffer_load_v3i32_tfe:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v5, v2
+; GFX12-SDAG-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v5
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: buffer_load_v3i32_tfe:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b96 v[0:1], v[2:4], off
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <3 x i32>, i32 } %load, 0
   store <3 x i32> %data, ptr addrspace(1) %out
@@ -795,6 +1035,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX6-LABEL: buffer_load_v3f32_tfe:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX6-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -810,6 +1053,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX8PLUS-LABEL: buffer_load_v3f32_tfe:
 ; GFX8PLUS:       ; %bb.0:
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX8PLUS-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
 ; GFX8PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8PLUS-NEXT:    flat_store_dwordx3 v[0:1], v[2:4]
@@ -820,20 +1066,45 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX11-LABEL: buffer_load_v3f32_tfe:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    v_mov_b32_e32 v4, v2
+; GFX11-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX11-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[2:4], off
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: buffer_load_v3f32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b96 v[0:1], v[2:4], off
-; GFX12-NEXT:    v_mov_b32_e32 v0, v5
-; GFX12-NEXT:    ; return to shader part epilog
+; NOPRT-LABEL: buffer_load_v3f32_tfe:
+; NOPRT:       ; %bb.0:
+; NOPRT-NEXT:    v_mov_b32_e32 v5, 0
+; NOPRT-NEXT:    buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    global_store_b96 v[0:1], v[2:4], off
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v5
+; NOPRT-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: buffer_load_v3f32_tfe:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v5, v2
+; GFX12-SDAG-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v5
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: buffer_load_v3f32_tfe:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b96 v[0:1], v[2:4], off
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %load = call { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <3 x float>, i32 } %load, 0
   store <3 x float> %data, ptr addrspace(1) %out
@@ -846,6 +1117,9 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX6-LABEL: buffer_load_v2i32_tfe:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX6-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -860,6 +1134,8 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX8PLUS-LABEL: buffer_load_v2i32_tfe:
 ; GFX8PLUS:       ; %bb.0:
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8PLUS-NEXT:    buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe
 ; GFX8PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8PLUS-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
@@ -870,20 +1146,43 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX11-LABEL: buffer_load_v2i32_tfe:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX11-NEXT:    buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: buffer_load_v2i32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GFX12-NEXT:    v_mov_b32_e32 v0, v4
-; GFX12-NEXT:    ; return to shader part epilog
+; NOPRT-LABEL: buffer_load_v2i32_tfe:
+; NOPRT:       ; %bb.0:
+; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
+; NOPRT-NEXT:    buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v4
+; NOPRT-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: buffer_load_v2i32_tfe:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2
+; GFX12-SDAG-NEXT:    buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: buffer_load_v2i32_tfe:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT:    buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %load = call { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <2 x i32>, i32 } %load, 0
   store <2 x i32> %data, ptr addrspace(1) %out
@@ -896,6 +1195,9 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX6-LABEL: buffer_load_v2f32_tfe:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX6-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -910,6 +1212,8 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX8PLUS-LABEL: buffer_load_v2f32_tfe:
 ; GFX8PLUS:       ; %bb.0:
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8PLUS-NEXT:    buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe
 ; GFX8PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8PLUS-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
@@ -920,20 +1224,43 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX11-LABEL: buffer_load_v2f32_tfe:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX11-NEXT:    buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: buffer_load_v2f32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GFX12-NEXT:    v_mov_b32_e32 v0, v4
-; GFX12-NEXT:    ; return to shader part epilog
+; NOPRT-LABEL: buffer_load_v2f32_tfe:
+; NOPRT:       ; %bb.0:
+; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
+; NOPRT-NEXT:    buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v4
+; NOPRT-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: buffer_load_v2f32_tfe:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2
+; GFX12-SDAG-NEXT:    buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: buffer_load_v2f32_tfe:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT:    buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %load = call { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <2 x float>, i32 } %load, 0
   store <2 x float> %data, ptr addrspace(1) %out
@@ -946,6 +1273,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace
 ; GFX6-LABEL: buffer_load_i32_tfe:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX6-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -960,6 +1288,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace
 ; GFX8PLUS-LABEL: buffer_load_i32_tfe:
 ; GFX8PLUS:       ; %bb.0:
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8PLUS-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
 ; GFX8PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8PLUS-NEXT:    flat_store_dword v[0:1], v2
@@ -970,20 +1299,42 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace
 ; GFX11-LABEL: buffer_load_i32_tfe:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX11-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: buffer_load_i32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
-; GFX12-NEXT:    ; return to shader part epilog
+; NOPRT-LABEL: buffer_load_i32_tfe:
+; NOPRT:       ; %bb.0:
+; NOPRT-NEXT:    v_mov_b32_e32 v3, 0
+; NOPRT-NEXT:    buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    global_store_b32 v[0:1], v2, off
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v3
+; NOPRT-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: buffer_load_i32_tfe:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-SDAG-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: buffer_load_i32_tfe:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %load = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { i32, i32 } %load, 0
   store i32 %data, ptr addrspace(1) %out
@@ -996,6 +1347,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace
 ; GFX6-LABEL: buffer_load_f32_tfe:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX6-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -1010,6 +1362,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace
 ; GFX8PLUS-LABEL: buffer_load_f32_tfe:
 ; GFX8PLUS:       ; %bb.0:
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8PLUS-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
 ; GFX8PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8PLUS-NEXT:    flat_store_dword v[0:1], v2
@@ -1020,20 +1373,42 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace
 ; GFX11-LABEL: buffer_load_f32_tfe:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX11-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: buffer_load_f32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
-; GFX12-NEXT:    ; return to shader part epilog
+; NOPRT-LABEL: buffer_load_f32_tfe:
+; NOPRT:       ; %bb.0:
+; NOPRT-NEXT:    v_mov_b32_e32 v3, 0
+; NOPRT-NEXT:    buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    global_store_b32 v[0:1], v2, off
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v3
+; NOPRT-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: buffer_load_f32_tfe:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-SDAG-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: buffer_load_f32_tfe:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %load = call { float, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { float, i32 } %load, 0
   store float %data, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll
index b0bd4e428ef2dd..c5202b84fa1efe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll
@@ -2,6 +2,7 @@
 ;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GFX6 %s
 ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GFX8PLUS %s
 ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GFX11 %s
+;RUN: llc < %s -mtriple=amdgcn -mattr=-enable-prt-strict-null -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=NOPRT %s
 
 define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) {
 ; GFX6-LABEL: buffer_load:
@@ -31,6 +32,16 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrsp
 ; GFX11-NEXT:    buffer_load_format_xyzw v[8:11], v8, s[0:3], 0 idxen slc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v8, 0
+; NOPRT-NEXT:    s_clause 0x2
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v8, s[0:3], 0 idxen
+; NOPRT-NEXT:    buffer_load_format_xyzw v[4:7], v8, s[0:3], 0 idxen glc
+; NOPRT-NEXT:    buffer_load_format_xyzw v[8:11], v8, s[0:3], 0 idxen slc
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0)
   %data_glc = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1)
@@ -62,6 +73,13 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) {
 ; GFX11-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:42
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_immoffs:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v0, 0
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:42
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0)
   ret <4 x float> %data
@@ -126,6 +144,25 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg)
 ; GFX11-NEXT:    v_dual_add_f32 v0, v8, v0 :: v_dual_add_f32 v3, v11, v3
 ; GFX11-NEXT:    v_add_f32_e32 v2, v10, v2
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_immoffs_large:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v8, 0
+; NOPRT-NEXT:    s_movk_i32 s4, 0x7ffc
+; NOPRT-NEXT:    s_clause 0x1
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v8, s[0:3], 60 idxen offset:4092
+; NOPRT-NEXT:    buffer_load_format_xyzw v[4:7], v8, s[0:3], s4 idxen offset:4092
+; NOPRT-NEXT:    s_mov_b32 s4, 0x8ffc
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    v_add_f32_e32 v1, v1, v5
+; NOPRT-NEXT:    buffer_load_format_xyzw v[8:11], v8, s[0:3], s4 idxen offset:4
+; NOPRT-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v3, v3, v7
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v1, v9, v1
+; NOPRT-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; NOPRT-NEXT:    v_dual_add_f32 v0, v8, v0 :: v_dual_add_f32 v3, v11, v3
+; NOPRT-NEXT:    v_add_f32_e32 v2, v10, v2
+; NOPRT-NEXT:    ; return to shader part epilog
 main_body:
   %d.0 = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 60, i32 0)
   %d.1 = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 32764, i32 0)
@@ -156,6 +193,13 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(ptr addrspace(8) i
 ; GFX11-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:4092
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_voffset_large_12bit:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v0, 0
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:4092
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 0, i32 0)
   ret <4 x float> %data
@@ -188,6 +232,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(ptr addrspace(8) i
 ; GFX11-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_voffset_large_13bit:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 8188, i32 0, i32 0)
   ret <4 x float> %data
@@ -220,6 +273,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(ptr addrspace(8) i
 ; GFX11-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_voffset_large_16bit:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, 0xf000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 65532, i32 0, i32 0)
   ret <4 x float> %data
@@ -252,6 +314,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(ptr addrspace(8) i
 ; GFX11-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_voffset_large_23bit:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, 0x7ff000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 8388604, i32 0, i32 0)
   ret <4 x float> %data
@@ -284,6 +355,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(ptr addrspace(8) i
 ; GFX11-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_voffset_large_24bit:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, 0xfff000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 16777212, i32 0, i32 0)
   ret <4 x float> %data
@@ -307,6 +387,12 @@ define amdgpu_ps <4 x float> @buffer_load_idx(ptr addrspace(8) inreg, i32) {
 ; GFX11-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_idx:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i32 0, i32 0)
   ret <4 x float> %data
@@ -339,6 +425,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) {
 ; GFX11-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_ofs:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 %1, i32 0, i32 0)
   ret <4 x float> %data
@@ -371,6 +466,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) {
 ; GFX11-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_ofs_imm:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
 main_body:
   %ofs = add i32 %1, 60
   %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs, i32 0, i32 0)
@@ -395,6 +499,12 @@ define amdgpu_ps <4 x float> @buffer_load_both(ptr addrspace(8) inreg, i32, i32)
 ; GFX11-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_both:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %1, i32 %2, i32 0, i32 0)
   ret <4 x float> %data
@@ -421,6 +531,13 @@ define amdgpu_ps <4 x float> @buffer_load_both_reversed(ptr addrspace(8) inreg,
 ; GFX11-NEXT:    buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_both_reversed:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v2, v0
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %2, i32 %1, i32 0, i32 0)
   ret <4 x float> %data
@@ -447,6 +564,13 @@ define amdgpu_ps float @buffer_load_x(ptr addrspace(8) inreg %rsrc) {
 ; GFX11-NEXT:    buffer_load_format_x v0, v0, s[0:3], 0 idxen
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_x:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v0, 0
+; NOPRT-NEXT:    buffer_load_format_x v0, v0, s[0:3], 0 idxen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
 main_body:
   %data = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
   ret float %data
@@ -473,6 +597,13 @@ define amdgpu_ps float @buffer_load_x_i32(ptr addrspace(8) inreg %rsrc) {
 ; GFX11-NEXT:    buffer_load_format_x v0, v0, s[0:3], 0 idxen
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_x_i32:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v0, 0
+; NOPRT-NEXT:    buffer_load_format_x v0, v0, s[0:3], 0 idxen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
 main_body:
   %data = call i32 @llvm.amdgcn.struct.ptr.buffer.load.format.i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
   %fdata = bitcast i32 %data to float
@@ -500,6 +631,13 @@ define amdgpu_ps <2 x float> @buffer_load_xy(ptr addrspace(8) inreg %rsrc) {
 ; GFX11-NEXT:    buffer_load_format_xy v[0:1], v0, s[0:3], 0 idxen
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_xy:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v0, 0
+; NOPRT-NEXT:    buffer_load_format_xy v[0:1], v0, s[0:3], 0 idxen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <2 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
   ret <2 x float> %data
@@ -509,6 +647,10 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX6-LABEL: buffer_load_v4i32_tfe:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
+; GFX6-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX6-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -523,6 +665,10 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX8PLUS-LABEL: buffer_load_v4i32_tfe:
 ; GFX8PLUS:       ; %bb.0:
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX8PLUS-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
 ; GFX8PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8PLUS-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
@@ -533,11 +679,25 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX11-LABEL: buffer_load_v4i32_tfe:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    v_mov_b32_e32 v4, v2
+; GFX11-NEXT:    v_mov_b32_e32 v5, v2
+; GFX11-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX11-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_v4i32_tfe:
+; NOPRT:       ; %bb.0:
+; NOPRT-NEXT:    v_mov_b32_e32 v6, 0
+; NOPRT-NEXT:    buffer_load_format_xyzw v[2:6], v6, s[0:3], 0 idxen tfe
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v6
+; NOPRT-NEXT:    ; return to shader part epilog
   %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v4i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <4 x i32>, i32 } %load, 0
   store <4 x i32> %data, ptr addrspace(1) %out
@@ -550,6 +710,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX6-LABEL: buffer_load_v4f32_tfe:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
+; GFX6-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX6-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -564,6 +728,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX8PLUS-LABEL: buffer_load_v4f32_tfe:
 ; GFX8PLUS:       ; %bb.0:
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX8PLUS-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
 ; GFX8PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8PLUS-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
@@ -574,11 +742,25 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX11-LABEL: buffer_load_v4f32_tfe:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    v_mov_b32_e32 v4, v2
+; GFX11-NEXT:    v_mov_b32_e32 v5, v2
+; GFX11-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX11-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_v4f32_tfe:
+; NOPRT:       ; %bb.0:
+; NOPRT-NEXT:    v_mov_b32_e32 v6, 0
+; NOPRT-NEXT:    buffer_load_format_xyzw v[2:6], v6, s[0:3], 0 idxen tfe
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v6
+; NOPRT-NEXT:    ; return to shader part epilog
   %load = call { <4 x float>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v4f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <4 x float>, i32 } %load, 0
   store <4 x float> %data, ptr addrspace(1) %out
@@ -591,6 +773,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX6-LABEL: buffer_load_v3i32_tfe:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX6-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -606,6 +791,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX8PLUS-LABEL: buffer_load_v3i32_tfe:
 ; GFX8PLUS:       ; %bb.0:
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX8PLUS-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
 ; GFX8PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8PLUS-NEXT:    flat_store_dwordx3 v[0:1], v[2:4]
@@ -616,11 +804,24 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX11-LABEL: buffer_load_v3i32_tfe:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    v_mov_b32_e32 v4, v2
+; GFX11-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX11-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[2:4], off
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_v3i32_tfe:
+; NOPRT:       ; %bb.0:
+; NOPRT-NEXT:    v_mov_b32_e32 v5, 0
+; NOPRT-NEXT:    buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    global_store_b96 v[0:1], v[2:4], off
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v5
+; NOPRT-NEXT:    ; return to shader part epilog
   %load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v3i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <3 x i32>, i32 } %load, 0
   store <3 x i32> %data, ptr addrspace(1) %out
@@ -633,6 +834,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX6-LABEL: buffer_load_v3f32_tfe:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX6-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -648,6 +852,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX8PLUS-LABEL: buffer_load_v3f32_tfe:
 ; GFX8PLUS:       ; %bb.0:
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX8PLUS-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
 ; GFX8PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8PLUS-NEXT:    flat_store_dwordx3 v[0:1], v[2:4]
@@ -658,11 +865,24 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX11-LABEL: buffer_load_v3f32_tfe:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    v_mov_b32_e32 v4, v2
+; GFX11-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX11-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[2:4], off
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_v3f32_tfe:
+; NOPRT:       ; %bb.0:
+; NOPRT-NEXT:    v_mov_b32_e32 v5, 0
+; NOPRT-NEXT:    buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    global_store_b96 v[0:1], v[2:4], off
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v5
+; NOPRT-NEXT:    ; return to shader part epilog
   %load = call { <3 x float>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v3f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <3 x float>, i32 } %load, 0
   store <3 x float> %data, ptr addrspace(1) %out
@@ -675,6 +895,9 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX6-LABEL: buffer_load_v2i32_tfe:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX6-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -689,6 +912,8 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX8PLUS-LABEL: buffer_load_v2i32_tfe:
 ; GFX8PLUS:       ; %bb.0:
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8PLUS-NEXT:    buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe
 ; GFX8PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8PLUS-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
@@ -699,11 +924,23 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX11-LABEL: buffer_load_v2i32_tfe:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX11-NEXT:    buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_v2i32_tfe:
+; NOPRT:       ; %bb.0:
+; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
+; NOPRT-NEXT:    buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v4
+; NOPRT-NEXT:    ; return to shader part epilog
   %load = call { <2 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v2i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <2 x i32>, i32 } %load, 0
   store <2 x i32> %data, ptr addrspace(1) %out
@@ -716,6 +953,9 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX6-LABEL: buffer_load_v2f32_tfe:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX6-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -730,6 +970,8 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX8PLUS-LABEL: buffer_load_v2f32_tfe:
 ; GFX8PLUS:       ; %bb.0:
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8PLUS-NEXT:    buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe
 ; GFX8PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8PLUS-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
@@ -740,11 +982,23 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
 ; GFX11-LABEL: buffer_load_v2f32_tfe:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX11-NEXT:    buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_v2f32_tfe:
+; NOPRT:       ; %bb.0:
+; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
+; NOPRT-NEXT:    buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v4
+; NOPRT-NEXT:    ; return to shader part epilog
   %load = call { <2 x float>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v2f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <2 x float>, i32 } %load, 0
   store <2 x float> %data, ptr addrspace(1) %out
@@ -757,6 +1011,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad
 ; GFX6-LABEL: buffer_load_i32_tfe:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX6-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -771,6 +1026,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad
 ; GFX8PLUS-LABEL: buffer_load_i32_tfe:
 ; GFX8PLUS:       ; %bb.0:
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8PLUS-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
 ; GFX8PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8PLUS-NEXT:    flat_store_dword v[0:1], v2
@@ -781,11 +1037,22 @@ define amdgpu_cs float @buffer_load_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad
 ; GFX11-LABEL: buffer_load_i32_tfe:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX11-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_i32_tfe:
+; NOPRT:       ; %bb.0:
+; NOPRT-NEXT:    v_mov_b32_e32 v3, 0
+; NOPRT-NEXT:    buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    global_store_b32 v[0:1], v2, off
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v3
+; NOPRT-NEXT:    ; return to shader part epilog
   %load = call { i32, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { i32, i32 } %load, 0
   store i32 %data, ptr addrspace(1) %out
@@ -798,6 +1065,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad
 ; GFX6-LABEL: buffer_load_f32_tfe:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX6-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -812,6 +1080,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad
 ; GFX8PLUS-LABEL: buffer_load_f32_tfe:
 ; GFX8PLUS:       ; %bb.0:
 ; GFX8PLUS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8PLUS-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
 ; GFX8PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8PLUS-NEXT:    flat_store_dword v[0:1], v2
@@ -822,11 +1091,22 @@ define amdgpu_cs float @buffer_load_f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad
 ; GFX11-LABEL: buffer_load_f32_tfe:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX11-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_f32_tfe:
+; NOPRT:       ; %bb.0:
+; NOPRT-NEXT:    v_mov_b32_e32 v3, 0
+; NOPRT-NEXT:    buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    global_store_b32 v[0:1], v2, off
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v3
+; NOPRT-NEXT:    ; return to shader part epilog
   %load = call { float, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { float, i32 } %load, 0
   store float %data, ptr addrspace(1) %out



More information about the llvm-commits mailing list