[llvm] [SelectionDAG] Detect impossible conditions using known bits analysis (PR #150715)

via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 25 15:51:39 PDT 2025


https://github.com/AZero13 created https://github.com/llvm/llvm-project/pull/150715

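Teach DAGCombiner::visitSETCC to fold an integer setcc with a constant right-hand side when known-bits analysis proves the comparison is always true or always false, as GlobalISel already does. With this change SelectionDAG produces the same code as GlobalISel for select_noccmp1 in arm64-ccmp.ll. The check lines in AMDGPU/frame-index-elimination.ll were regenerated with utils/update_llc_test_checks.py.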

>From 2573dfba759b15cb0fee3242514489975767db69 Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Fri, 25 Jul 2025 18:51:24 -0400
Subject: [PATCH] [SelectionDAG] Detect impossible conditions using known bits
 analysis

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   52 +
 llvm/test/CodeGen/AArch64/arm64-ccmp.ll       |   20 +-
 .../CodeGen/AMDGPU/frame-index-elimination.ll | 2105 +++++++++++++++++
 3 files changed, 2161 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0e8e4c9618bb2..ca29e6fe1fb40 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13511,6 +13511,58 @@ SDValue DAGCombiner::visitSETCC(SDNode *N) {
   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
   SDLoc DL(N);
 
+  // Detect conditions that are always true or always false using known bits
+  // analysis. Only a scalar integer constant RHS is handled here; when the
+  // result cannot be determined, fall through to the generic combines below.
+  if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
+    // makeConstant yields fully known bits, so KnownRHS is never unknown.
+    const KnownBits KnownRHS = KnownBits::makeConstant(N1C->getAPIntValue());
+    std::optional<bool> KnownVal;
+
+    // Unsigned comparisons against zero need no analysis of the LHS:
+    //   x >=u 0 -> always true
+    //   x <u 0  -> always false
+    if (KnownRHS.isZero()) {
+      if (Cond == ISD::SETUGE)
+        KnownVal = true;
+      else if (Cond == ISD::SETULT)
+        KnownVal = false;
+    }
+
+    // Otherwise map the ISD condition code to an IR predicate and let
+    // ICmpInst::compare decide from the known bits of both operands.
+    if (!KnownVal) {
+      std::optional<CmpInst::Predicate> Pred;
+      switch (Cond) {
+      case ISD::SETEQ:  Pred = CmpInst::ICMP_EQ; break;
+      case ISD::SETNE:  Pred = CmpInst::ICMP_NE; break;
+      case ISD::SETULT: Pred = CmpInst::ICMP_ULT; break;
+      case ISD::SETULE: Pred = CmpInst::ICMP_ULE; break;
+      case ISD::SETUGT: Pred = CmpInst::ICMP_UGT; break;
+      case ISD::SETUGE: Pred = CmpInst::ICMP_UGE; break;
+      case ISD::SETLT:  Pred = CmpInst::ICMP_SLT; break;
+      case ISD::SETLE:  Pred = CmpInst::ICMP_SLE; break;
+      case ISD::SETGT:  Pred = CmpInst::ICMP_SGT; break;
+      case ISD::SETGE:  Pred = CmpInst::ICMP_SGE; break;
+      default:
+        // No integer predicate equivalent. Leave Pred unset instead of
+        // returning, so the remaining setcc combines below still run.
+        break;
+      }
+
+      if (Pred) {
+        const KnownBits KnownLHS = DAG.computeKnownBits(N0);
+        KnownVal = ICmpInst::compare(KnownLHS, KnownRHS, *Pred);
+      }
+    }
+
+    // If the comparison result is known, replace the setcc with a constant.
+    // Emit it in the target's boolean representation for this operand type:
+    // "true" is all-ones, not 1, under ZeroOrNegativeOneBooleanContent.
+    if (KnownVal)
+      return DAG.getBoolConstant(*KnownVal, DL, VT, N0.getValueType());
+  }
+
   if (SDValue Combined = SimplifySetCC(VT, N0, N1, Cond, DL, !PreferSetCC)) {
     // If we prefer to have a setcc, and we don't, we'll try our best to
     // recreate one using rebuildSetCC.
diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
index 06e957fdcc6a2..9b22abcc94d3b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -597,22 +597,10 @@ define i32 @select_andor32(i32 %v1, i32 %v2, i32 %v3) {
 }
 
 define i64 @select_noccmp1(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
-; SDISEL-LABEL: select_noccmp1:
-; SDISEL:       ; %bb.0:
-; SDISEL-NEXT:    cmp x0, #0
-; SDISEL-NEXT:    ccmp x0, #13, #4, lt
-; SDISEL-NEXT:    cset w8, gt
-; SDISEL-NEXT:    cmp x2, #2
-; SDISEL-NEXT:    ccmp x2, #4, #4, lt
-; SDISEL-NEXT:    csinc w8, w8, wzr, le
-; SDISEL-NEXT:    cmp w8, #0
-; SDISEL-NEXT:    csel x0, xzr, x3, ne
-; SDISEL-NEXT:    ret
-;
-; GISEL-LABEL: select_noccmp1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov x0, x3
-; GISEL-NEXT:    ret
+; CHECK-LABEL: select_noccmp1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    mov x0, x3
+; CHECK-NEXT:    ret
   %c0 = icmp slt i64 %v1, 0
   %c1 = icmp sgt i64 %v1, 13
   %c2 = icmp slt i64 %v3, 2
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 40cff44d6d3e6..81868b6e01e74 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-MUBUF,MUBUF %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca,+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s
@@ -21,6 +22,46 @@
 
 ; GCN: ds_write_b32 v0, v0
 define void @func_mov_fi_i32() #0 {
+; CI-LABEL: func_mov_fi_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_mov_fi_i32:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_mov_fi_i32:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: func_mov_fi_i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, s32
+; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: func_mov_fi_i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s32
+; GFX11-FAKE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
   store volatile ptr addrspace(5) %alloca, ptr addrspace(3) poison
   ret void
@@ -46,6 +87,61 @@ define void @func_mov_fi_i32() #0 {
 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, [[ADD]]
 ; GFX9-NEXT:         ds_write_b32 v0, v0
 define void @func_mov_fi_i32_offset() #0 {
+; CI-LABEL: func_mov_fi_i32_offset:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_mov_fi_i32_offset:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_mov_fi_i32_offset:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-FLATSCR-NEXT:    s_add_i32 s0, s32, 4
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: func_mov_fi_i32_offset:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_add_i32 s0, s32, 4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, s0
+; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v1
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: func_mov_fi_i32_offset:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_add_i32 s0, s32, 4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, s0
+; GFX11-FAKE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-FAKE16-NEXT:    ds_store_b32 v0, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca i32, addrspace(5)
   %alloca1 = alloca i32, addrspace(5)
   store volatile ptr addrspace(5) %alloca0, ptr addrspace(3) poison
@@ -71,6 +167,48 @@ define void @func_mov_fi_i32_offset() #0 {
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
 define void @func_add_constant_to_fi_i32() #0 {
+; CI-LABEL: func_add_constant_to_fi_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v1, s32, 6
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 4, v1
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_add_constant_to_fi_i32:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 4, v1
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_add_constant_to_fi_i32:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_add_u32_e64 v0, 4, s32
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: func_add_constant_to_fi_i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e64 v0, 4, s32
+; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: func_add_constant_to_fi_i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e64 v0, 4, s32
+; GFX11-FAKE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca [2 x i32], align 4, addrspace(5)
   %gep0 = getelementptr inbounds [2 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
   store volatile ptr addrspace(5) %gep0, ptr addrspace(3) poison
@@ -93,6 +231,55 @@ define void @func_add_constant_to_fi_i32() #0 {
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
 define void @func_other_fi_user_i32() #0 {
+; CI-LABEL: func_other_fi_user_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_lshr_b32 s5, s32, 6
+; CI-NEXT:    s_mul_i32 s4, s5, 9
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_other_fi_user_i32:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_lshr_b32 s5, s32, 6
+; GFX9-MUBUF-NEXT:    s_mul_i32 s4, s5, 9
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_other_fi_user_i32:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_mul_i32 s0, s32, 9
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: func_other_fi_user_i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mul_i32 s0, s32, 9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: func_other_fi_user_i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mul_i32 s0, s32, 9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca [2 x i32], align 4, addrspace(5)
   %ptrtoint = ptrtoint ptr addrspace(5) %alloca to i32
   %mul = mul i32 %ptrtoint, 9
@@ -105,6 +292,45 @@ define void @func_other_fi_user_i32() #0 {
 ; MUBUF:        buffer_store_dword v1, v0, s[0:3], 0 offen{{$}}
 ; GFX9-FLATSCR: scratch_store_dword v0, v1, off{{$}}
 define void @func_store_private_arg_i32_ptr(ptr addrspace(5) %ptr) #0 {
+; CI-LABEL: func_store_private_arg_i32_ptr:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v1, 15
+; CI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_store_private_arg_i32_ptr:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v1, 15
+; GFX9-MUBUF-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_store_private_arg_i32_ptr:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 15
+; GFX9-FLATSCR-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: func_store_private_arg_i32_ptr:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 15
+; GFX11-TRUE16-NEXT:    scratch_store_b32 v0, v1, off dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: func_store_private_arg_i32_ptr:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 15
+; GFX11-FAKE16-NEXT:    scratch_store_b32 v0, v1, off dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store volatile i32 15, ptr addrspace(5) %ptr
   ret void
 }
@@ -114,6 +340,40 @@ define void @func_store_private_arg_i32_ptr(ptr addrspace(5) %ptr) #0 {
 ; MUBUF-NEXT:        buffer_load_dword v0, v0, s[0:3], 0 offen glc{{$}}
 ; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off glc{{$}}
 define void @func_load_private_arg_i32_ptr(ptr addrspace(5) %ptr) #0 {
+; CI-LABEL: func_load_private_arg_i32_ptr:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_load_private_arg_i32_ptr:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_load_private_arg_i32_ptr:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: func_load_private_arg_i32_ptr:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v0, v0, off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: func_load_private_arg_i32_ptr:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, v0, off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val = load volatile i32, ptr addrspace(5) %ptr
   ret void
 }
@@ -132,6 +392,48 @@ define void @func_load_private_arg_i32_ptr(ptr addrspace(5) %ptr) #0 {
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
 define void @void_func_byval_struct_i8_i32_ptr(ptr addrspace(5) byval({ i8, i32 }) %arg0) #0 {
+; CI-LABEL: void_func_byval_struct_i8_i32_ptr:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v1, s32, 6
+; CI-NEXT:    v_or_b32_e32 v0, 4, v1
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: void_func_byval_struct_i8_i32_ptr:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
+; GFX9-MUBUF-NEXT:    v_or_b32_e32 v0, 4, v1
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: void_func_byval_struct_i8_i32_ptr:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_or_b32_e64 v0, s32, 4
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: void_func_byval_struct_i8_i32_ptr:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_or_b32_e64 v0, s32, 4
+; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_byval_struct_i8_i32_ptr:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_or_b32_e64 v0, s32, 4
+; GFX11-FAKE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 0
   %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 1
   %load1 = load i32, ptr addrspace(5) %gep1
@@ -146,6 +448,68 @@ define void @void_func_byval_struct_i8_i32_ptr(ptr addrspace(5) byval({ i8, i32
 ; GFX9-FLATSCR-NEXT: scratch_load_ubyte v0, off, s32
 ; GFX9-FLATSCR-NEXT: scratch_load_dword v1, off, s32 offset:4
 define void @void_func_byval_struct_i8_i32_ptr_value(ptr addrspace(5) byval({ i8, i32 }) %arg0) #0 {
+; CI-LABEL: void_func_byval_struct_i8_i32_ptr_value:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], s32
+; CI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    ds_write_b8 v0, v0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    ds_write_b32 v0, v1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: void_func_byval_struct_i8_i32_ptr_value:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    buffer_load_ubyte v0, off, s[0:3], s32
+; GFX9-MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-MUBUF-NEXT:    ds_write_b8 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v1
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: void_func_byval_struct_i8_i32_ptr_value:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    scratch_load_ubyte v0, off, s32
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v1, off, s32 offset:4
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT:    ds_write_b8 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v1
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: void_func_byval_struct_i8_i32_ptr_value:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    ds_store_b8 v0, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v1
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_byval_struct_i8_i32_ptr_value:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v0, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    ds_store_b8 v0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ds_store_b32 v0, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 0
   %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 1
   %load0 = load i8, ptr addrspace(5) %gep0
@@ -173,6 +537,88 @@ define void @void_func_byval_struct_i8_i32_ptr_value(ptr addrspace(5) byval({ i8
 
 ; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]]
 define void @void_func_byval_struct_i8_i32_ptr_nonentry_block(ptr addrspace(5) byval({ i8, i32 }) %arg0, i32 %arg2) #0 {
+; CI-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; CI-NEXT:    s_cbranch_execz .LBB8_2
+; CI-NEXT:  ; %bb.1: ; %bb
+; CI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v1, s32, 6
+; CI-NEXT:    v_add_i32_e64 v0, s[6:7], 4, v1
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:  .LBB8_2: ; %ret
+; CI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-MUBUF-NEXT:    s_cbranch_execz .LBB8_2
+; GFX9-MUBUF-NEXT:  ; %bb.1: ; %bb
+; GFX9-MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 4, v1
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:  .LBB8_2: ; %ret
+; GFX9-MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-FLATSCR-NEXT:    s_cbranch_execz .LBB8_2
+; GFX9-FLATSCR-NEXT:  ; %bb.1: ; %bb
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_add_u32_e64 v0, 4, s32
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:  .LBB8_2: ; %ret
+; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB8_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %bb
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e64 v0, 4, s32
+; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-TRUE16-NEXT:  .LBB8_2: ; %ret
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB8_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %bb
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e64 v0, 4, s32
+; GFX11-FAKE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-FAKE16-NEXT:  .LBB8_2: ; %ret
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %arg2, 0
   br i1 %cmp, label %bb, label %ret
 
@@ -202,6 +648,73 @@ ret:
 
 ; GCN: ds_write_b32 v0, [[VZ]]
 define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
+; CI-LABEL: func_other_fi_user_non_inline_imm_offset_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_lshr_b32 s5, s32, 6
+; CI-NEXT:    s_addk_i32 s5, 0x200
+; CI-NEXT:    v_mov_b32_e32 v0, 7
+; CI-NEXT:    s_mul_i32 s4, s5, 9
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:260
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_other_fi_user_non_inline_imm_offset_i32:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_lshr_b32 s5, s32, 6
+; GFX9-MUBUF-NEXT:    s_addk_i32 s5, 0x200
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v0, 7
+; GFX9-MUBUF-NEXT:    s_mul_i32 s4, s5, 9
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:260
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_other_fi_user_non_inline_imm_offset_i32:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_add_i32 s1, s32, 0x200
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, 7
+; GFX9-FLATSCR-NEXT:    s_mul_i32 s0, s1, 9
+; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:260
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: func_other_fi_user_non_inline_imm_offset_i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_add_i32 s1, s32, 0x200
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_mul_i32 s0, s1, 9
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, s0
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v0, s32 offset:260 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v1
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: func_other_fi_user_non_inline_imm_offset_i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_add_i32 s1, s32, 0x200
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_mul_i32 s0, s1, 9
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, s0
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:260 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    ds_store_b32 v0, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca [128 x i32], align 4, addrspace(5)
   %alloca1 = alloca [8 x i32], align 4, addrspace(5)
   %gep0 = getelementptr inbounds [128 x i32], ptr addrspace(5) %alloca0, i32 0, i32 65
@@ -225,6 +738,103 @@ define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
 
 ; GCN: ds_write_b32 v0, [[VZ]]
 define void @func_other_fi_user_non_inline_imm_offset_i32_vcc_live() #0 {
+; CI-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_lshr_b32 s5, s32, 6
+; CI-NEXT:    s_addk_i32 s5, 0x200
+; CI-NEXT:    v_mov_b32_e32 v0, 7
+; CI-NEXT:    s_mul_i32 s4, s5, 9
+; CI-NEXT:    ;;#ASMSTART
+; CI-NEXT:    ; def vcc
+; CI-NEXT:    ;;#ASMEND
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:260
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ;;#ASMSTART
+; CI-NEXT:    ; use vcc
+; CI-NEXT:    ;;#ASMEND
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_lshr_b32 s5, s32, 6
+; GFX9-MUBUF-NEXT:    s_addk_i32 s5, 0x200
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v0, 7
+; GFX9-MUBUF-NEXT:    s_mul_i32 s4, s5, 9
+; GFX9-MUBUF-NEXT:    ;;#ASMSTART
+; GFX9-MUBUF-NEXT:    ; def vcc
+; GFX9-MUBUF-NEXT:    ;;#ASMEND
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:260
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-MUBUF-NEXT:    ;;#ASMSTART
+; GFX9-MUBUF-NEXT:    ; use vcc
+; GFX9-MUBUF-NEXT:    ;;#ASMEND
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_add_i32 s1, s32, 0x200
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, 7
+; GFX9-FLATSCR-NEXT:    s_mul_i32 s0, s1, 9
+; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
+; GFX9-FLATSCR-NEXT:    ; def vcc
+; GFX9-FLATSCR-NEXT:    ;;#ASMEND
+; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:260
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
+; GFX9-FLATSCR-NEXT:    ; use vcc
+; GFX9-FLATSCR-NEXT:    ;;#ASMEND
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_add_i32 s1, s32, 0x200
+; GFX11-TRUE16-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-NEXT:    ; def vcc
+; GFX11-TRUE16-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-NEXT:    s_mul_i32 s0, s1, 9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, s0
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v0, s32 offset:260 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-NEXT:    ; use vcc
+; GFX11-TRUE16-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v1
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_add_i32 s1, s32, 0x200
+; GFX11-FAKE16-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-NEXT:    ; def vcc
+; GFX11-FAKE16-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-NEXT:    s_mul_i32 s0, s1, 9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, s0
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:260 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-NEXT:    ; use vcc
+; GFX11-FAKE16-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-NEXT:    ds_store_b32 v0, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca [128 x i32], align 4, addrspace(5)
   %alloca1 = alloca [8 x i32], align 4, addrspace(5)
   %vcc = call i64 asm sideeffect "; def $0", "={vcc}"()
@@ -253,6 +863,486 @@ declare void @func(ptr addrspace(5) nocapture) #0
 ; FLATSCR: scratch_store_dword v0, off, s33 offset:
 ; FLATSCR: scratch_store_dword v{{[0-9]+}}, off, s33 offset:
 define void @undefined_stack_store_reg(float %arg, i32 %arg1) #0 {
+; CI-LABEL: undefined_stack_store_reg:
+; CI:       ; %bb.0: ; %bb
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s16, s33
+; CI-NEXT:    s_mov_b32 s33, s32
+; CI-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; CI-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; CI-NEXT:    s_mov_b64 exec, s[18:19]
+; CI-NEXT:    v_writelane_b32 v42, s16, 18
+; CI-NEXT:    v_writelane_b32 v42, s30, 0
+; CI-NEXT:    v_writelane_b32 v42, s31, 1
+; CI-NEXT:    v_writelane_b32 v42, s34, 2
+; CI-NEXT:    v_writelane_b32 v42, s35, 3
+; CI-NEXT:    v_writelane_b32 v42, s36, 4
+; CI-NEXT:    v_writelane_b32 v42, s37, 5
+; CI-NEXT:    v_writelane_b32 v42, s38, 6
+; CI-NEXT:    v_writelane_b32 v42, s39, 7
+; CI-NEXT:    v_writelane_b32 v42, s48, 8
+; CI-NEXT:    v_writelane_b32 v42, s49, 9
+; CI-NEXT:    v_writelane_b32 v42, s50, 10
+; CI-NEXT:    v_writelane_b32 v42, s51, 11
+; CI-NEXT:    v_writelane_b32 v42, s52, 12
+; CI-NEXT:    v_writelane_b32 v42, s53, 13
+; CI-NEXT:    v_writelane_b32 v42, s54, 14
+; CI-NEXT:    v_writelane_b32 v42, s55, 15
+; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; CI-NEXT:    v_writelane_b32 v42, s64, 16
+; CI-NEXT:    v_mov_b32_e32 v40, v0
+; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; CI-NEXT:    s_addk_i32 s32, 0xc00
+; CI-NEXT:    v_writelane_b32 v42, s65, 17
+; CI-NEXT:    buffer_store_dword v0, v0, s[0:3], 0 offen
+; CI-NEXT:    s_and_saveexec_b64 s[54:55], vcc
+; CI-NEXT:    s_cbranch_execz .LBB11_2
+; CI-NEXT:  ; %bb.1: ; %bb4
+; CI-NEXT:    s_getpc_b64 s[16:17]
+; CI-NEXT:    s_add_u32 s16, s16, func@gotpcrel32@lo+4
+; CI-NEXT:    s_addc_u32 s17, s17, func@gotpcrel32@hi+12
+; CI-NEXT:    s_load_dwordx2 s[64:65], s[16:17], 0x0
+; CI-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; CI-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; CI-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; CI-NEXT:    s_mov_b64 s[48:49], s[10:11]
+; CI-NEXT:    s_mov_b32 s50, s12
+; CI-NEXT:    s_mov_b32 s51, s13
+; CI-NEXT:    s_mov_b32 s52, s14
+; CI-NEXT:    s_mov_b32 s53, s15
+; CI-NEXT:    v_mov_b32_e32 v41, v31
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_swappc_b64 s[30:31], s[64:65]
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:28
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:24
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:20
+; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16
+; CI-NEXT:    v_lshr_b32_e64 v0, s33, 6
+; CI-NEXT:    s_mov_b64 s[4:5], s[34:35]
+; CI-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; CI-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; CI-NEXT:    s_mov_b64 s[10:11], s[48:49]
+; CI-NEXT:    s_mov_b32 s12, s50
+; CI-NEXT:    s_mov_b32 s13, s51
+; CI-NEXT:    s_mov_b32 s14, s52
+; CI-NEXT:    s_mov_b32 s15, s53
+; CI-NEXT:    v_mov_b32_e32 v31, v41
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
+; CI-NEXT:    s_swappc_b64 s[30:31], s[64:65]
+; CI-NEXT:  .LBB11_2: ; %bb5
+; CI-NEXT:    s_or_b64 exec, exec, s[54:55]
+; CI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; CI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CI-NEXT:    v_readlane_b32 s65, v42, 17
+; CI-NEXT:    v_readlane_b32 s64, v42, 16
+; CI-NEXT:    v_readlane_b32 s55, v42, 15
+; CI-NEXT:    v_readlane_b32 s54, v42, 14
+; CI-NEXT:    v_readlane_b32 s53, v42, 13
+; CI-NEXT:    v_readlane_b32 s52, v42, 12
+; CI-NEXT:    v_readlane_b32 s51, v42, 11
+; CI-NEXT:    v_readlane_b32 s50, v42, 10
+; CI-NEXT:    v_readlane_b32 s49, v42, 9
+; CI-NEXT:    v_readlane_b32 s48, v42, 8
+; CI-NEXT:    v_readlane_b32 s39, v42, 7
+; CI-NEXT:    v_readlane_b32 s38, v42, 6
+; CI-NEXT:    v_readlane_b32 s37, v42, 5
+; CI-NEXT:    v_readlane_b32 s36, v42, 4
+; CI-NEXT:    v_readlane_b32 s35, v42, 3
+; CI-NEXT:    v_readlane_b32 s34, v42, 2
+; CI-NEXT:    v_readlane_b32 s31, v42, 1
+; CI-NEXT:    v_readlane_b32 s30, v42, 0
+; CI-NEXT:    s_mov_b32 s32, s33
+; CI-NEXT:    v_readlane_b32 s4, v42, 18
+; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CI-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; CI-NEXT:    s_mov_b64 exec, s[6:7]
+; CI-NEXT:    s_mov_b32 s33, s4
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: undefined_stack_store_reg:
+; GFX9-MUBUF:       ; %bb.0: ; %bb
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_mov_b32 s16, s33
+; GFX9-MUBUF-NEXT:    s_mov_b32 s33, s32
+; GFX9-MUBUF-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-MUBUF-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX9-MUBUF-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s16, 18
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s34, 2
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s35, 3
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s36, 4
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s37, 5
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s38, 6
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s39, 7
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s48, 8
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s49, 9
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s50, 10
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s51, 11
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s52, 12
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s53, 13
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s54, 14
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s55, 15
+; GFX9-MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-MUBUF-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s64, 16
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-MUBUF-NEXT:    s_addk_i32 s32, 0xc00
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v42, s65, 17
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT:    s_and_saveexec_b64 s[54:55], vcc
+; GFX9-MUBUF-NEXT:    s_cbranch_execz .LBB11_2
+; GFX9-MUBUF-NEXT:  ; %bb.1: ; %bb4
+; GFX9-MUBUF-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-MUBUF-NEXT:    s_add_u32 s16, s16, func@gotpcrel32@lo+4
+; GFX9-MUBUF-NEXT:    s_addc_u32 s17, s17, func@gotpcrel32@hi+12
+; GFX9-MUBUF-NEXT:    s_load_dwordx2 s[64:65], s[16:17], 0x0
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[48:49], s[10:11]
+; GFX9-MUBUF-NEXT:    s_mov_b32 s50, s12
+; GFX9-MUBUF-NEXT:    s_mov_b32 s51, s13
+; GFX9-MUBUF-NEXT:    s_mov_b32 s52, s14
+; GFX9-MUBUF-NEXT:    s_mov_b32 s53, s15
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v41, v31
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_swappc_b64 s[30:31], s[64:65]
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:28
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:24
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:20
+; GFX9-MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[4:5], s[34:35]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[10:11], s[48:49]
+; GFX9-MUBUF-NEXT:    s_mov_b32 s12, s50
+; GFX9-MUBUF-NEXT:    s_mov_b32 s13, s51
+; GFX9-MUBUF-NEXT:    s_mov_b32 s14, s52
+; GFX9-MUBUF-NEXT:    s_mov_b32 s15, s53
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v31, v41
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 16, v0
+; GFX9-MUBUF-NEXT:    s_swappc_b64 s[30:31], s[64:65]
+; GFX9-MUBUF-NEXT:  .LBB11_2: ; %bb5
+; GFX9-MUBUF-NEXT:    s_or_b64 exec, exec, s[54:55]
+; GFX9-MUBUF-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s65, v42, 17
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s64, v42, 16
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s55, v42, 15
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s54, v42, 14
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s53, v42, 13
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s52, v42, 12
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s51, v42, 11
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s50, v42, 10
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s49, v42, 9
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s48, v42, 8
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s39, v42, 7
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s38, v42, 6
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s37, v42, 5
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s36, v42, 4
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s35, v42, 3
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s34, v42, 2
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-MUBUF-NEXT:    s_mov_b32 s32, s33
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s4, v42, 18
+; GFX9-MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-MUBUF-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GFX9-MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-MUBUF-NEXT:    s_mov_b32 s33, s4
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: undefined_stack_store_reg:
+; GFX9-FLATSCR:       ; %bb.0: ; %bb
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s0, s33
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s33, s32
+; GFX9-FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v44, s33 offset:32 ; 4-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s0, 18
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s30, 0
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s31, 1
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s34, 2
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s35, 3
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s36, 4
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s37, 5
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s38, 6
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s39, 7
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s48, 8
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s49, 9
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s50, 10
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s51, 11
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s52, 12
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s53, 13
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s54, 14
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s55, 15
+; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s64, 16
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-FLATSCR-NEXT:    s_add_i32 s32, s32, 48
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v44, s65, 17
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[40:43], s0
+; GFX9-FLATSCR-NEXT:    s_and_saveexec_b64 s[54:55], vcc
+; GFX9-FLATSCR-NEXT:    s_cbranch_execz .LBB11_2
+; GFX9-FLATSCR-NEXT:  ; %bb.1: ; %bb4
+; GFX9-FLATSCR-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-FLATSCR-NEXT:    s_add_u32 s0, s0, func@gotpcrel32@lo+4
+; GFX9-FLATSCR-NEXT:    s_addc_u32 s1, s1, func@gotpcrel32@hi+12
+; GFX9-FLATSCR-NEXT:    s_load_dwordx2 s[64:65], s[0:1], 0x0
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[48:49], s[10:11]
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s50, s12
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s51, s13
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s52, s14
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s53, s15
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v41, v31
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[64:65]
+; GFX9-FLATSCR-NEXT:    s_add_i32 s0, s33, 16
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[4:5], s[34:35]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[10:11], s[48:49]
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s12, s50
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s13, s51
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s14, s52
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s15, s53
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v31, v41
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[40:43], s33 offset:16
+; GFX9-FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[64:65]
+; GFX9-FLATSCR-NEXT:  .LBB11_2: ; %bb5
+; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[54:55]
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s65, v44, 17
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s64, v44, 16
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s55, v44, 15
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s54, v44, 14
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s53, v44, 13
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s52, v44, 12
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s51, v44, 11
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s50, v44, 10
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s49, v44, 9
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s48, v44, 8
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s39, v44, 7
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s38, v44, 6
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s37, v44, 5
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s36, v44, 4
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s35, v44, 3
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s34, v44, 2
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s31, v44, 1
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s32, s33
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s0, v44, 18
+; GFX9-FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v44, off, s33 offset:32 ; 4-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s33, s0
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: undefined_stack_store_reg:
+; GFX11-TRUE16:       ; %bb.0: ; %bb
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s33 offset:32 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s0, 17
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v40, v0
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 48
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s30, 0
+; GFX11-TRUE16-NEXT:    scratch_store_b128 off, v[40:43], s0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s31, 1
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s34, 2
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s35, 3
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s36, 4
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s37, 5
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s38, 6
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s39, 7
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s48, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s49, 9
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s50, 10
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s51, 11
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s52, 12
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s53, 13
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s54, 14
+; GFX11-TRUE16-NEXT:    s_mov_b32 s54, exec_lo
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s64, 15
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s65, 16
+; GFX11-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %bb4
+; GFX11-TRUE16-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-TRUE16-NEXT:    s_add_u32 s0, s0, func@gotpcrel32@lo+4
+; GFX11-TRUE16-NEXT:    s_addc_u32 s1, s1, func@gotpcrel32@hi+12
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX11-TRUE16-NEXT:    s_load_b64 s[64:65], s[0:1], 0x0
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[48:49], s[10:11]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s50, s12
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v41, v31
+; GFX11-TRUE16-NEXT:    s_mov_b32 s51, s13
+; GFX11-TRUE16-NEXT:    s_mov_b32 s52, s14
+; GFX11-TRUE16-NEXT:    s_mov_b32 s53, s15
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[64:65]
+; GFX11-TRUE16-NEXT:    s_add_i32 s0, s33, 16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v31, v41 :: v_dual_mov_b32 v0, s0
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[4:5], s[34:35]
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[10:11], s[48:49]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s50
+; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s51
+; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s52
+; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s53
+; GFX11-TRUE16-NEXT:    scratch_store_b128 off, v[40:43], s33 offset:16
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[64:65]
+; GFX11-TRUE16-NEXT:  .LBB11_2: ; %bb5
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s54
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s65, v44, 16
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s64, v44, 15
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s54, v44, 14
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s53, v44, 13
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s52, v44, 12
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s51, v44, 11
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s50, v44, 10
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s49, v44, 9
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s48, v44, 8
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s39, v44, 7
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s38, v44, 6
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s37, v44, 5
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s36, v44, 4
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s35, v44, 3
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s34, v44, 2
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v44, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v44, 17
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s33 offset:32 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: undefined_stack_store_reg:
+; GFX11-FAKE16:       ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s33 offset:32 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s0, 17
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v40, v0
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 48
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s30, 0
+; GFX11-FAKE16-NEXT:    scratch_store_b128 off, v[40:43], s0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s31, 1
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s34, 2
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s35, 3
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s36, 4
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s37, 5
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s38, 6
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s39, 7
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s48, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s49, 9
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s50, 10
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s51, 11
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s52, 12
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s53, 13
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s54, 14
+; GFX11-FAKE16-NEXT:    s_mov_b32 s54, exec_lo
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s64, 15
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s65, 16
+; GFX11-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %bb4
+; GFX11-FAKE16-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-FAKE16-NEXT:    s_add_u32 s0, s0, func@gotpcrel32@lo+4
+; GFX11-FAKE16-NEXT:    s_addc_u32 s1, s1, func@gotpcrel32@hi+12
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX11-FAKE16-NEXT:    s_load_b64 s[64:65], s[0:1], 0x0
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[48:49], s[10:11]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s50, s12
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v41, v31
+; GFX11-FAKE16-NEXT:    s_mov_b32 s51, s13
+; GFX11-FAKE16-NEXT:    s_mov_b32 s52, s14
+; GFX11-FAKE16-NEXT:    s_mov_b32 s53, s15
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[64:65]
+; GFX11-FAKE16-NEXT:    s_add_i32 s0, s33, 16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v31, v41 :: v_dual_mov_b32 v0, s0
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[4:5], s[34:35]
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[10:11], s[48:49]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, s50
+; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s51
+; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s52
+; GFX11-FAKE16-NEXT:    s_mov_b32 s15, s53
+; GFX11-FAKE16-NEXT:    scratch_store_b128 off, v[40:43], s33 offset:16
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[64:65]
+; GFX11-FAKE16-NEXT:  .LBB11_2: ; %bb5
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s54
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s65, v44, 16
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s64, v44, 15
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s54, v44, 14
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s53, v44, 13
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s52, v44, 12
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s51, v44, 11
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s50, v44, 10
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s49, v44, 9
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s48, v44, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s39, v44, 7
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s38, v44, 6
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s37, v44, 5
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s36, v44, 4
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s35, v44, 3
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s34, v44, 2
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v44, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v44, 17
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s33 offset:32 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = alloca <4 x float>, align 16, addrspace(5)
   %tmp2 = insertelement <4 x float> poison, float %arg, i32 0
@@ -285,6 +1375,88 @@ bb5:
 
 ; GCN: ds_write_b32 v{{[0-9]+}}, [[PTR]]
 define void @alloca_ptr_nonentry_block(i32 %arg0) #0 {
+; CI-LABEL: alloca_ptr_nonentry_block:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; CI-NEXT:    s_cbranch_execz .LBB12_2
+; CI-NEXT:  ; %bb.1: ; %bb
+; CI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v1, s32, 6
+; CI-NEXT:    v_or_b32_e32 v0, 4, v1
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:  .LBB12_2: ; %ret
+; CI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: alloca_ptr_nonentry_block:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-MUBUF-NEXT:    s_cbranch_execz .LBB12_2
+; GFX9-MUBUF-NEXT:  ; %bb.1: ; %bb
+; GFX9-MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
+; GFX9-MUBUF-NEXT:    v_or_b32_e32 v0, 4, v1
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:  .LBB12_2: ; %ret
+; GFX9-MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: alloca_ptr_nonentry_block:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-FLATSCR-NEXT:    s_cbranch_execz .LBB12_2
+; GFX9-FLATSCR-NEXT:  ; %bb.1: ; %bb
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_or_b32_e64 v0, s32, 4
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:  .LBB12_2: ; %ret
+; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: alloca_ptr_nonentry_block:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %bb
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_or_b32_e64 v0, s32, 4
+; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-TRUE16-NEXT:  .LBB12_2: ; %ret
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: alloca_ptr_nonentry_block:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %bb
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_or_b32_e64 v0, s32, 4
+; GFX11-FAKE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-FAKE16-NEXT:  .LBB12_2: ; %ret
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca { i8, i32 }, align 8, addrspace(5)
   %cmp = icmp eq i32 %arg0, 0
   br i1 %cmp, label %bb, label %ret
@@ -319,6 +1491,79 @@ ret:
 ; GFX11-FAKE16-DAG:     ds_store_b16 v{{[0-9]+}}, [[C]]  offset:8
 ; GFX11-FAKE16-NEXT:    s_endpgm
 define protected amdgpu_kernel void @tied_operand_test(i1 %c1, i1 %c2, i32 %val) {
+; CI-LABEL: tied_operand_test:
+; CI:       ; %bb.0: ; %entry
+; CI-NEXT:    s_add_u32 s0, s0, s17
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; CI-NEXT:    s_load_dword s4, s[8:9], 0x1
+; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_lshl_b32 s4, s4, 1
+; CI-NEXT:    v_mov_b32_e32 v2, s4
+; CI-NEXT:    ds_write_b16 v2, v1 offset:8
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    ds_write_b16 v2, v0 offset:10
+; CI-NEXT:    s_endpgm
+;
+; GFX9-MUBUF-LABEL: tied_operand_test:
+; GFX9-MUBUF:       ; %bb.0: ; %entry
+; GFX9-MUBUF-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; GFX9-MUBUF-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-MUBUF-NEXT:    ds_write_b16 v2, v1 offset:8
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    ds_write_b16 v2, v0 offset:10
+; GFX9-MUBUF-NEXT:    s_endpgm
+;
+; GFX9-FLATSCR-LABEL: tied_operand_test:
+; GFX9-FLATSCR:       ; %bb.0: ; %entry
+; GFX9-FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
+; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT:    scratch_load_ushort v0, off, s0
+; GFX9-FLATSCR-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-FLATSCR-NEXT:    ds_write_b16 v2, v1 offset:8
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    ds_write_b16 v2, v0 offset:10
+; GFX9-FLATSCR-NEXT:    s_endpgm
+;
+; GFX11-TRUE16-LABEL: tied_operand_test:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, off
+; GFX11-TRUE16-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0x7b
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-TRUE16-NEXT:    ds_store_b16_d16_hi v1, v0 offset:8
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ds_store_b16 v1, v0 offset:10
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: tied_operand_test:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, off
+; GFX11-FAKE16-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, 0x7b :: v_dual_mov_b32 v2, s0
+; GFX11-FAKE16-NEXT:    ds_store_b16 v2, v1 offset:8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ds_store_b16 v2, v0 offset:10
+; GFX11-FAKE16-NEXT:    s_endpgm
 entry:
   %scratch0 = alloca i16, align 4, addrspace(5)
   %scratch1 = alloca i16, align 4, addrspace(5)
@@ -345,6 +1590,115 @@ entry:
 ; GFX9-MUBUF-NEXT: v_add_u32_e32 [[SCALED_FP]], 0x3000, [[SCALED_FP]]
 ; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 64, [[SCALED_FP]]
 define void @fi_vop3_literal_error() {
+; CI-LABEL: fi_vop3_literal_error:
+; CI:       ; %bb.0: ; %entry
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s4, s33
+; CI-NEXT:    s_add_i32 s33, s32, 0x7ffc0
+; CI-NEXT:    s_and_b32 s33, s33, 0xfff80000
+; CI-NEXT:    v_lshr_b32_e64 v1, s33, 6
+; CI-NEXT:    s_movk_i32 vcc_lo, 0x3000
+; CI-NEXT:    v_add_i32_e32 v1, vcc, vcc_lo, v1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 64, v1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    v_mov_b32_e32 v2, 0x2000
+; CI-NEXT:    buffer_store_dword v1, v2, s[0:3], s33 offen
+; CI-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:4 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_mov_b32 s5, s34
+; CI-NEXT:    s_mov_b32 s34, s32
+; CI-NEXT:    s_add_i32 s32, s32, 0x200000
+; CI-NEXT:    s_mov_b32 s32, s34
+; CI-NEXT:    s_mov_b32 s34, s5
+; CI-NEXT:    s_mov_b32 s33, s4
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: fi_vop3_literal_error:
+; GFX9-MUBUF:       ; %bb.0: ; %entry
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_mov_b32 s4, s33
+; GFX9-MUBUF-NEXT:    s_add_i32 s33, s32, 0x7ffc0
+; GFX9-MUBUF-NEXT:    s_and_b32 s33, s33, 0xfff80000
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v1, 0x3000, v1
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 64, v1
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v2, 0x2000
+; GFX9-MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], s33 offen
+; GFX9-MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen glc
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    s_mov_b32 s5, s34
+; GFX9-MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 glc
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    s_mov_b32 s34, s32
+; GFX9-MUBUF-NEXT:    s_add_i32 s32, s32, 0x200000
+; GFX9-MUBUF-NEXT:    ; kill: killed $vgpr0
+; GFX9-MUBUF-NEXT:    s_mov_b32 s32, s34
+; GFX9-MUBUF-NEXT:    s_mov_b32 s34, s5
+; GFX9-MUBUF-NEXT:    s_mov_b32 s33, s4
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: fi_vop3_literal_error:
+; GFX9-FLATSCR:       ; %bb.0: ; %entry
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s0, s33
+; GFX9-FLATSCR-NEXT:    s_add_i32 s33, s32, 0x1fff
+; GFX9-FLATSCR-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s1, s34
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s34, s32
+; GFX9-FLATSCR-NEXT:    s_add_i32 s32, s32, 0x8000
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-FLATSCR-NEXT:    s_add_i32 s2, s33, 0x2000
+; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v0, s2
+; GFX9-FLATSCR-NEXT:    s_add_i32 s2, s33, 0x3000
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s2 offset:64 glc
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s32, s34
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s34, s1
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s33, s0
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: fi_vop3_literal_error:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_add_i32 s33, s32, 0x1fff
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, s34
+; GFX11-TRUE16-NEXT:    s_mov_b32 s34, s32
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 0x8000
+; GFX11-TRUE16-NEXT:    s_add_i32 s2, s33, 0x2000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s34
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v0, s2
+; GFX11-TRUE16-NEXT:    s_add_i32 s2, s33, 0x3000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s34, s1
+; GFX11-TRUE16-NEXT:    scratch_load_b64 v[0:1], off, s2 offset:64 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fi_vop3_literal_error:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_add_i32 s33, s32, 0x1fff
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, s34
+; GFX11-FAKE16-NEXT:    s_mov_b32 s34, s32
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 0x8000
+; GFX11-FAKE16-NEXT:    s_add_i32 s2, s33, 0x2000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s34
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v0, s2
+; GFX11-FAKE16-NEXT:    s_add_i32 s2, s33, 0x3000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s34, s1
+; GFX11-FAKE16-NEXT:    scratch_load_b64 v[0:1], off, s2 offset:64 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %pin.low = alloca i32, align 8192, addrspace(5)
   %local.area = alloca [1060 x i64], align 4096, addrspace(5)
@@ -363,6 +1717,132 @@ entry:
 ; GCN: s_add_u32 [[ADD_LO:s[0-9]+]], 0, 0x2010
 ; GCN: s_addc_u32 [[ADD_HI:s[0-9]+]], s{{[0-9]+}}, 0
 define amdgpu_kernel void @fi_sop2_s_add_u32_literal_error() #0 {
+; CI-LABEL: fi_sop2_s_add_u32_literal_error:
+; CI:       ; %bb.0: ; %entry
+; CI-NEXT:    s_load_dword s5, s[8:9], 0x30
+; CI-NEXT:    s_add_u32 s0, s0, s17
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    s_add_u32 s4, 0, 0x2010
+; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_addc_u32 s5, s5, 0
+; CI-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], 2
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v0
+; CI-NEXT:  .LBB15_1: ; %.shuffle.then.i.i.i.i
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; CI-NEXT:    s_cbranch_vccnz .LBB15_1
+; CI-NEXT:  ; %bb.2: ; %vector.body.i.i.i.i
+; CI-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
+; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
+;
+; GFX9-MUBUF-LABEL: fi_sop2_s_add_u32_literal_error:
+; GFX9-MUBUF:       ; %bb.0: ; %entry
+; GFX9-MUBUF-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX9-MUBUF-NEXT:    s_add_u32 s4, 0, 0x2010
+; GFX9-MUBUF-NEXT:    s_addc_u32 s5, s5, 0
+; GFX9-MUBUF-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], 2
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-MUBUF-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-MUBUF-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v0
+; GFX9-MUBUF-NEXT:  .LBB15_1: ; %.shuffle.then.i.i.i.i
+; GFX9-MUBUF-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-MUBUF-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GFX9-MUBUF-NEXT:    s_cbranch_vccnz .LBB15_1
+; GFX9-MUBUF-NEXT:  ; %bb.2: ; %vector.body.i.i.i.i
+; GFX9-MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
+; GFX9-MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GFX9-MUBUF-NEXT:    s_endpgm
+;
+; GFX9-FLATSCR-LABEL: fi_sop2_s_add_u32_literal_error:
+; GFX9-FLATSCR:       ; %bb.0: ; %entry
+; GFX9-FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
+; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX9-FLATSCR-NEXT:    s_add_u32 s0, 0, 0x2010
+; GFX9-FLATSCR-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-FLATSCR-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[0:1], 2
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s2, 0
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s2
+; GFX9-FLATSCR-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-FLATSCR-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX9-FLATSCR-NEXT:  .LBB15_1: ; %.shuffle.then.i.i.i.i
+; GFX9-FLATSCR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-FLATSCR-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX9-FLATSCR-NEXT:    s_cbranch_vccnz .LBB15_1
+; GFX9-FLATSCR-NEXT:  ; %bb.2: ; %vector.body.i.i.i.i
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT:    s_nop 1
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 offset:4
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0
+; GFX9-FLATSCR-NEXT:    s_endpgm
+;
+; GFX11-TRUE16-LABEL: fi_sop2_s_add_u32_literal_error:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX11-TRUE16-NEXT:    s_add_u32 s0, 0, 0x2010
+; GFX11-TRUE16-NEXT:    s_addc_u32 s1, s1, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT:    v_cmp_lt_u64_e64 s0, s[0:1], 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-TRUE16-NEXT:    scratch_store_b64 off, v[0:1], s1
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 1, v2
+; GFX11-TRUE16-NEXT:  .LBB15_1: ; %.shuffle.then.i.i.i.i
+; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB15_1
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %vector.body.i.i.i.i
+; GFX11-TRUE16-NEXT:    scratch_load_b64 v[0:1], off, off offset:4
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: fi_sop2_s_add_u32_literal_error:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX11-FAKE16-NEXT:    s_add_u32 s0, 0, 0x2010
+; GFX11-FAKE16-NEXT:    s_addc_u32 s1, s1, 0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    v_cmp_lt_u64_e64 s0, s[0:1], 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-FAKE16-NEXT:    scratch_store_b64 off, v[0:1], s1
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e64 s0, 1, v2
+; GFX11-FAKE16-NEXT:  .LBB15_1: ; %.shuffle.then.i.i.i.i
+; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB15_1
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %vector.body.i.i.i.i
+; GFX11-FAKE16-NEXT:    scratch_load_b64 v[0:1], off, off offset:4
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX11-FAKE16-NEXT:    s_endpgm
 entry:
   %.omp.reduction.element.i.i.i.i = alloca [1024 x i32], align 4, addrspace(5)
   %Total3.i.i = alloca [1024 x i32], align 16, addrspace(5)
@@ -385,6 +1865,116 @@ vector.body.i.i.i.i:                              ; preds = %.shuffle.then.i.i.i
 ; GCN-LABEL: {{^}}fi_sop2_and_literal_error:
 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1fe00
 define amdgpu_kernel void @fi_sop2_and_literal_error() #0 {
+; CI-LABEL: fi_sop2_and_literal_error:
+; CI:       ; %bb.0: ; %entry
+; CI-NEXT:    s_add_u32 s0, s0, s17
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    s_mov_b64 s[4:5], -1
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v0
+; CI-NEXT:  .LBB16_1: ; %.shuffle.then.i.i.i.i
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; CI-NEXT:    s_cbranch_vccnz .LBB16_1
+; CI-NEXT:  ; %bb.2: ; %vector.body.i.i.i.i
+; CI-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
+; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
+;
+; GFX9-MUBUF-LABEL: fi_sop2_and_literal_error:
+; GFX9-MUBUF:       ; %bb.0: ; %entry
+; GFX9-MUBUF-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[4:5], -1
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-MUBUF-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-MUBUF-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v0
+; GFX9-MUBUF-NEXT:  .LBB16_1: ; %.shuffle.then.i.i.i.i
+; GFX9-MUBUF-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-MUBUF-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GFX9-MUBUF-NEXT:    s_cbranch_vccnz .LBB16_1
+; GFX9-MUBUF-NEXT:  ; %bb.2: ; %vector.body.i.i.i.i
+; GFX9-MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
+; GFX9-MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GFX9-MUBUF-NEXT:    s_endpgm
+;
+; GFX9-FLATSCR-LABEL: fi_sop2_and_literal_error:
+; GFX9-FLATSCR:       ; %bb.0: ; %entry
+; GFX9-FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[0:1], -1
+; GFX9-FLATSCR-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-FLATSCR-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX9-FLATSCR-NEXT:  .LBB16_1: ; %.shuffle.then.i.i.i.i
+; GFX9-FLATSCR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-FLATSCR-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX9-FLATSCR-NEXT:    s_cbranch_vccnz .LBB16_1
+; GFX9-FLATSCR-NEXT:  ; %bb.2: ; %vector.body.i.i.i.i
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT:    s_nop 1
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 offset:4
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0
+; GFX9-FLATSCR-NEXT:    s_endpgm
+;
+; GFX11-TRUE16-LABEL: fi_sop2_and_literal_error:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, -1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 1, v2
+; GFX11-TRUE16-NEXT:    scratch_store_b64 off, v[0:1], s1
+; GFX11-TRUE16-NEXT:  .LBB16_1: ; %.shuffle.then.i.i.i.i
+; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB16_1
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %vector.body.i.i.i.i
+; GFX11-TRUE16-NEXT:    scratch_load_b64 v[0:1], off, off offset:4
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: fi_sop2_and_literal_error:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, -1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e64 s0, 1, v2
+; GFX11-FAKE16-NEXT:    scratch_store_b64 off, v[0:1], s1
+; GFX11-FAKE16-NEXT:  .LBB16_1: ; %.shuffle.then.i.i.i.i
+; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB16_1
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %vector.body.i.i.i.i
+; GFX11-FAKE16-NEXT:    scratch_load_b64 v[0:1], off, off offset:4
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX11-FAKE16-NEXT:    s_endpgm
 entry:
   %.omp.reduction.element.i.i.i.i = alloca [1024 x i32], align 4, addrspace(5)
   %Total3.i.i = alloca [1024 x i32], align 16, addrspace(5)
@@ -406,6 +1996,116 @@ vector.body.i.i.i.i:                              ; preds = %.shuffle.then.i.i.i
 ; GCN-LABEL: {{^}}fi_sop2_or_literal_error:
 ; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039
 define amdgpu_kernel void @fi_sop2_or_literal_error() #0 {
+; CI-LABEL: fi_sop2_or_literal_error:
+; CI:       ; %bb.0: ; %entry
+; CI-NEXT:    s_add_u32 s0, s0, s17
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    s_mov_b64 s[4:5], -1
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v0
+; CI-NEXT:  .LBB17_1: ; %.shuffle.then.i.i.i.i
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; CI-NEXT:    s_cbranch_vccnz .LBB17_1
+; CI-NEXT:  ; %bb.2: ; %vector.body.i.i.i.i
+; CI-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
+; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
+;
+; GFX9-MUBUF-LABEL: fi_sop2_or_literal_error:
+; GFX9-MUBUF:       ; %bb.0: ; %entry
+; GFX9-MUBUF-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[4:5], -1
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-MUBUF-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-MUBUF-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v0
+; GFX9-MUBUF-NEXT:  .LBB17_1: ; %.shuffle.then.i.i.i.i
+; GFX9-MUBUF-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-MUBUF-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GFX9-MUBUF-NEXT:    s_cbranch_vccnz .LBB17_1
+; GFX9-MUBUF-NEXT:  ; %bb.2: ; %vector.body.i.i.i.i
+; GFX9-MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
+; GFX9-MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GFX9-MUBUF-NEXT:    s_endpgm
+;
+; GFX9-FLATSCR-LABEL: fi_sop2_or_literal_error:
+; GFX9-FLATSCR:       ; %bb.0: ; %entry
+; GFX9-FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[0:1], -1
+; GFX9-FLATSCR-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-FLATSCR-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX9-FLATSCR-NEXT:  .LBB17_1: ; %.shuffle.then.i.i.i.i
+; GFX9-FLATSCR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-FLATSCR-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX9-FLATSCR-NEXT:    s_cbranch_vccnz .LBB17_1
+; GFX9-FLATSCR-NEXT:  ; %bb.2: ; %vector.body.i.i.i.i
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT:    s_nop 1
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 offset:4
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0
+; GFX9-FLATSCR-NEXT:    s_endpgm
+;
+; GFX11-TRUE16-LABEL: fi_sop2_or_literal_error:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, -1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 1, v2
+; GFX11-TRUE16-NEXT:    scratch_store_b64 off, v[0:1], s1
+; GFX11-TRUE16-NEXT:  .LBB17_1: ; %.shuffle.then.i.i.i.i
+; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB17_1
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %vector.body.i.i.i.i
+; GFX11-TRUE16-NEXT:    scratch_load_b64 v[0:1], off, off offset:4
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: fi_sop2_or_literal_error:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, -1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e64 s0, 1, v2
+; GFX11-FAKE16-NEXT:    scratch_store_b64 off, v[0:1], s1
+; GFX11-FAKE16-NEXT:  .LBB17_1: ; %.shuffle.then.i.i.i.i
+; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB17_1
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %vector.body.i.i.i.i
+; GFX11-FAKE16-NEXT:    scratch_load_b64 v[0:1], off, off offset:4
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX11-FAKE16-NEXT:    s_endpgm
 entry:
   %.omp.reduction.element.i.i.i.i = alloca [1024 x i32], align 4, addrspace(5)
   %Total3.i.i = alloca [1024 x i32], align 16, addrspace(5)
@@ -435,6 +2135,76 @@ vector.body.i.i.i.i:                              ; preds = %.shuffle.then.i.i.i
 ; GCN: s_mov_b32 [[ALLOCA0:s[0-9]+]], 0
 ; GCN: ; use [[SELECT]], [[ALLOCA0]]
 define amdgpu_kernel void @s_multiple_frame_indexes_literal_offsets(i32 inreg %arg0) #0 {
+; CI-LABEL: s_multiple_frame_indexes_literal_offsets:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dword s4, s[8:9], 0x0
+; CI-NEXT:    s_add_u32 s0, s0, s17
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    s_movk_i32 s5, 0x44
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_cmp_eq_u32 s4, 0
+; CI-NEXT:    s_cselect_b32 s4, s5, 0x48
+; CI-NEXT:    s_mov_b32 s5, 0
+; CI-NEXT:    ;;#ASMSTART
+; CI-NEXT:    ; use s4, s5
+; CI-NEXT:    ;;#ASMEND
+; CI-NEXT:    s_endpgm
+;
+; GFX9-MUBUF-LABEL: s_multiple_frame_indexes_literal_offsets:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX9-MUBUF-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT:    s_movk_i32 s5, 0x44
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-MUBUF-NEXT:    s_cselect_b32 s4, s5, 0x48
+; GFX9-MUBUF-NEXT:    s_mov_b32 s5, 0
+; GFX9-MUBUF-NEXT:    ;;#ASMSTART
+; GFX9-MUBUF-NEXT:    ; use s4, s5
+; GFX9-MUBUF-NEXT:    ;;#ASMEND
+; GFX9-MUBUF-NEXT:    s_endpgm
+;
+; GFX9-FLATSCR-LABEL: s_multiple_frame_indexes_literal_offsets:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
+; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s1, 0x44
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s0, s1, 0x48
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s1, 0
+; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
+; GFX9-FLATSCR-NEXT:    ; use s0, s1
+; GFX9-FLATSCR-NEXT:    ;;#ASMEND
+; GFX9-FLATSCR-NEXT:    s_endpgm
+;
+; GFX11-TRUE16-LABEL: s_multiple_frame_indexes_literal_offsets:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    s_movk_i32 s1, 0x44
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s0, s1, 0x48
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-NEXT:    ; use s0, s1
+; GFX11-TRUE16-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_multiple_frame_indexes_literal_offsets:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    s_movk_i32 s1, 0x44
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s0, s1, 0x48
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-NEXT:    ; use s0, s1
+; GFX11-FAKE16-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-NEXT:    s_endpgm
   %alloca0 = alloca [17 x i32], align 8, addrspace(5)
   %alloca1 = alloca i32, align 4, addrspace(5)
   %alloca2 = alloca i32, align 4, addrspace(5)
@@ -455,6 +2225,76 @@ define amdgpu_kernel void @s_multiple_frame_indexes_literal_offsets(i32 inreg %a
 ; GCN: s_mov_b32 [[ALLOCA0:s[0-9]+]], 0
 ; GCN: ; use [[SELECT]], [[ALLOCA0]]
 define amdgpu_kernel void @s_multiple_frame_indexes_one_imm_one_literal_offset(i32 inreg %arg0) #0 {
+; CI-LABEL: s_multiple_frame_indexes_one_imm_one_literal_offset:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dword s4, s[8:9], 0x0
+; CI-NEXT:    s_add_u32 s0, s0, s17
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    s_mov_b32 s5, 64
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_cmp_eq_u32 s4, 0
+; CI-NEXT:    s_cselect_b32 s4, s5, 0x44
+; CI-NEXT:    s_mov_b32 s5, 0
+; CI-NEXT:    ;;#ASMSTART
+; CI-NEXT:    ; use s4, s5
+; CI-NEXT:    ;;#ASMEND
+; CI-NEXT:    s_endpgm
+;
+; GFX9-MUBUF-LABEL: s_multiple_frame_indexes_one_imm_one_literal_offset:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX9-MUBUF-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT:    s_mov_b32 s5, 64
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-MUBUF-NEXT:    s_cselect_b32 s4, s5, 0x44
+; GFX9-MUBUF-NEXT:    s_mov_b32 s5, 0
+; GFX9-MUBUF-NEXT:    ;;#ASMSTART
+; GFX9-MUBUF-NEXT:    ; use s4, s5
+; GFX9-MUBUF-NEXT:    ;;#ASMEND
+; GFX9-MUBUF-NEXT:    s_endpgm
+;
+; GFX9-FLATSCR-LABEL: s_multiple_frame_indexes_one_imm_one_literal_offset:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
+; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s1, 64
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s0, s1, 0x44
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s1, 0
+; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
+; GFX9-FLATSCR-NEXT:    ; use s0, s1
+; GFX9-FLATSCR-NEXT:    ;;#ASMEND
+; GFX9-FLATSCR-NEXT:    s_endpgm
+;
+; GFX11-TRUE16-LABEL: s_multiple_frame_indexes_one_imm_one_literal_offset:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, 64
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s0, s1, 0x44
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-NEXT:    ; use s0, s1
+; GFX11-TRUE16-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_multiple_frame_indexes_one_imm_one_literal_offset:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, 64
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s0, s1, 0x44
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-NEXT:    ; use s0, s1
+; GFX11-FAKE16-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-NEXT:    s_endpgm
   %alloca0 = alloca [16 x i32], align 8, addrspace(5)
   %alloca1 = alloca i32, align 4, addrspace(5)
   %alloca2 = alloca i32, align 4, addrspace(5)
@@ -472,6 +2312,76 @@ define amdgpu_kernel void @s_multiple_frame_indexes_one_imm_one_literal_offset(i
 ; GCN: s_mov_b32 [[ALLOCA0:s[0-9]+]], 0
 ; GCN: ; use [[SELECT]], [[ALLOCA0]]
 define amdgpu_kernel void @s_multiple_frame_indexes_imm_offsets(i32 inreg %arg0) #0 {
+; CI-LABEL: s_multiple_frame_indexes_imm_offsets:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dword s4, s[8:9], 0x0
+; CI-NEXT:    s_add_u32 s0, s0, s17
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    s_mov_b32 s5, 16
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_cmp_eq_u32 s4, 0
+; CI-NEXT:    s_cselect_b32 s4, s5, 20
+; CI-NEXT:    s_mov_b32 s5, 0
+; CI-NEXT:    ;;#ASMSTART
+; CI-NEXT:    ; use s4, s5
+; CI-NEXT:    ;;#ASMEND
+; CI-NEXT:    s_endpgm
+;
+; GFX9-MUBUF-LABEL: s_multiple_frame_indexes_imm_offsets:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX9-MUBUF-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT:    s_mov_b32 s5, 16
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-MUBUF-NEXT:    s_cselect_b32 s4, s5, 20
+; GFX9-MUBUF-NEXT:    s_mov_b32 s5, 0
+; GFX9-MUBUF-NEXT:    ;;#ASMSTART
+; GFX9-MUBUF-NEXT:    ; use s4, s5
+; GFX9-MUBUF-NEXT:    ;;#ASMEND
+; GFX9-MUBUF-NEXT:    s_endpgm
+;
+; GFX9-FLATSCR-LABEL: s_multiple_frame_indexes_imm_offsets:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
+; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s1, 16
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s0, s1, 20
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s1, 0
+; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
+; GFX9-FLATSCR-NEXT:    ; use s0, s1
+; GFX9-FLATSCR-NEXT:    ;;#ASMEND
+; GFX9-FLATSCR-NEXT:    s_endpgm
+;
+; GFX11-TRUE16-LABEL: s_multiple_frame_indexes_imm_offsets:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, 16
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s0, s1, 20
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-NEXT:    ; use s0, s1
+; GFX11-TRUE16-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_multiple_frame_indexes_imm_offsets:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s0, s1, 20
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-NEXT:    ; use s0, s1
+; GFX11-FAKE16-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-NEXT:    s_endpgm
   %alloca0 = alloca [4 x i32], align 8, addrspace(5)
   %alloca1 = alloca i32, align 4, addrspace(5)
   %alloca2 = alloca i32, align 4, addrspace(5)
@@ -489,6 +2399,71 @@ define amdgpu_kernel void @s_multiple_frame_indexes_imm_offsets(i32 inreg %arg0)
 ; GCN: v_mov_b32_e32 [[ALLOCA0:v[0-9]+]], 0{{$}}
 ; GCN: ; use [[SELECT]], [[ALLOCA0]]
 define amdgpu_kernel void @v_multiple_frame_indexes_literal_offsets() #0 {
+; CI-LABEL: v_multiple_frame_indexes_literal_offsets:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_add_u32 s0, s0, s17
+; CI-NEXT:    v_mov_b32_e32 v1, 0x48
+; CI-NEXT:    v_mov_b32_e32 v2, 0x44
+; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    ;;#ASMSTART
+; CI-NEXT:    ; use v0, v1
+; CI-NEXT:    ;;#ASMEND
+; CI-NEXT:    s_endpgm
+;
+; GFX9-MUBUF-LABEL: v_multiple_frame_indexes_literal_offsets:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x48
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v2, 0x44
+; GFX9-MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-MUBUF-NEXT:    ;;#ASMSTART
+; GFX9-MUBUF-NEXT:    ; use v0, v1
+; GFX9-MUBUF-NEXT:    ;;#ASMEND
+; GFX9-MUBUF-NEXT:    s_endpgm
+;
+; GFX9-FLATSCR-LABEL: v_multiple_frame_indexes_literal_offsets:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x48
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v2, 0x44
+; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
+; GFX9-FLATSCR-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
+; GFX9-FLATSCR-NEXT:    ; use v0, v1
+; GFX9-FLATSCR-NEXT:    ;;#ASMEND
+; GFX9-FLATSCR-NEXT:    s_endpgm
+;
+; GFX11-TRUE16-LABEL: v_multiple_frame_indexes_literal_offsets:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0x44
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v0, 0x48, v1 :: v_dual_mov_b32 v1, 0
+; GFX11-TRUE16-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-NEXT:    ; use v0, v1
+; GFX11-TRUE16-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_multiple_frame_indexes_literal_offsets:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0x44
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, 0x48, v1 :: v_dual_mov_b32 v1, 0
+; GFX11-FAKE16-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-NEXT:    ; use v0, v1
+; GFX11-FAKE16-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-NEXT:    s_endpgm
   %vgpr = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca0 = alloca [17 x i32], align 8, addrspace(5)
   %alloca1 = alloca i32, align 4, addrspace(5)
@@ -507,6 +2482,69 @@ define amdgpu_kernel void @v_multiple_frame_indexes_literal_offsets() #0 {
 ; GCN: v_mov_b32_e32 [[ALLOCA0:v[0-9]+]], 0{{$}}
 ; GCN: ; use [[SELECT]], [[ALLOCA0]]
 define amdgpu_kernel void @v_multiple_frame_indexes_one_imm_one_literal_offset() #0 {
+; CI-LABEL: v_multiple_frame_indexes_one_imm_one_literal_offset:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_add_u32 s0, s0, s17
+; CI-NEXT:    v_mov_b32_e32 v1, 0x44
+; CI-NEXT:    v_mov_b32_e32 v2, 64
+; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    ;;#ASMSTART
+; CI-NEXT:    ; use v0, v1
+; CI-NEXT:    ;;#ASMEND
+; CI-NEXT:    s_endpgm
+;
+; GFX9-MUBUF-LABEL: v_multiple_frame_indexes_one_imm_one_literal_offset:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x44
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v2, 64
+; GFX9-MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-MUBUF-NEXT:    ;;#ASMSTART
+; GFX9-MUBUF-NEXT:    ; use v0, v1
+; GFX9-MUBUF-NEXT:    ;;#ASMEND
+; GFX9-MUBUF-NEXT:    s_endpgm
+;
+; GFX9-FLATSCR-LABEL: v_multiple_frame_indexes_one_imm_one_literal_offset:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x44
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v2, 64
+; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
+; GFX9-FLATSCR-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
+; GFX9-FLATSCR-NEXT:    ; use v0, v1
+; GFX9-FLATSCR-NEXT:    ;;#ASMEND
+; GFX9-FLATSCR-NEXT:    s_endpgm
+;
+; GFX11-TRUE16-LABEL: v_multiple_frame_indexes_one_imm_one_literal_offset:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, 64 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v0, 0x44, v1 :: v_dual_mov_b32 v1, 0
+; GFX11-TRUE16-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-NEXT:    ; use v0, v1
+; GFX11-TRUE16-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_multiple_frame_indexes_one_imm_one_literal_offset:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, 64 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, 0x44, v1 :: v_dual_mov_b32 v1, 0
+; GFX11-FAKE16-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-NEXT:    ; use v0, v1
+; GFX11-FAKE16-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-NEXT:    s_endpgm
   %vgpr = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca0 = alloca [16 x i32], align 8, addrspace(5)
   %alloca1 = alloca i32, align 4, addrspace(5)
@@ -525,6 +2563,69 @@ define amdgpu_kernel void @v_multiple_frame_indexes_one_imm_one_literal_offset()
 ; GCN: v_mov_b32_e32 [[ALLOCA0:v[0-9]+]], 0{{$}}
 ; GCN: ; use [[SELECT]], [[ALLOCA0]]
 define amdgpu_kernel void @v_multiple_frame_indexes_imm_offsets() #0 {
+; CI-LABEL: v_multiple_frame_indexes_imm_offsets:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_add_u32 s0, s0, s17
+; CI-NEXT:    v_mov_b32_e32 v1, 12
+; CI-NEXT:    v_mov_b32_e32 v2, 8
+; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    ;;#ASMSTART
+; CI-NEXT:    ; use v0, v1
+; CI-NEXT:    ;;#ASMEND
+; CI-NEXT:    s_endpgm
+;
+; GFX9-MUBUF-LABEL: v_multiple_frame_indexes_imm_offsets:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v1, 12
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v2, 8
+; GFX9-MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-MUBUF-NEXT:    ;;#ASMSTART
+; GFX9-MUBUF-NEXT:    ; use v0, v1
+; GFX9-MUBUF-NEXT:    ;;#ASMEND
+; GFX9-MUBUF-NEXT:    s_endpgm
+;
+; GFX9-FLATSCR-LABEL: v_multiple_frame_indexes_imm_offsets:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 12
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v2, 8
+; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
+; GFX9-FLATSCR-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
+; GFX9-FLATSCR-NEXT:    ; use v0, v1
+; GFX9-FLATSCR-NEXT:    ;;#ASMEND
+; GFX9-FLATSCR-NEXT:    s_endpgm
+;
+; GFX11-TRUE16-LABEL: v_multiple_frame_indexes_imm_offsets:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, 8 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v0, 12, v1 :: v_dual_mov_b32 v1, 0
+; GFX11-TRUE16-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-NEXT:    ; use v0, v1
+; GFX11-TRUE16-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_multiple_frame_indexes_imm_offsets:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, 8 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, 12, v1 :: v_dual_mov_b32 v1, 0
+; GFX11-FAKE16-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-NEXT:    ; use v0, v1
+; GFX11-FAKE16-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-NEXT:    s_endpgm
   %vgpr = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca0 = alloca [2 x i32], align 8, addrspace(5)
   %alloca1 = alloca i32, align 4, addrspace(5)
@@ -536,3 +2637,7 @@ define amdgpu_kernel void @v_multiple_frame_indexes_imm_offsets() #0 {
 }
 
 attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; GFX9: {{.*}}
+; MUBUF: {{.*}}



More information about the llvm-commits mailing list