[llvm] [AMDGPU] Select v_lshl_add_u32 instead of v_mul_lo_u32 by constant (PR #71035)

via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 3 05:17:54 PDT 2023


https://github.com/sstipanovic updated https://github.com/llvm/llvm-project/pull/71035

>From 7f0610c4cb4530957d1d556dcbc95cb38b79ba96 Mon Sep 17 00:00:00 2001
From: Stefan Stipanovic <stefan.stipanovic at syrmia.com>
Date: Fri, 3 Nov 2023 11:49:15 +0100
Subject: [PATCH] [AMDGPU] Select v_lshl_add_u32 instead of v_mul_lo_u32 by
 constant

---
 llvm/lib/Target/AMDGPU/VOP3Instructions.td    |   14 +
 .../CodeGen/AMDGPU/frame-index-elimination.ll | 1155 ++++++++++++++---
 llvm/test/CodeGen/AMDGPU/mul.ll               |   39 +
 llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll |    8 +-
 llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll    |   14 +-
 llvm/test/CodeGen/AMDGPU/wqm.ll               |    4 +-
 6 files changed, 1065 insertions(+), 169 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index c0e0ac1b4ec8873..114d33b077866a1 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -515,6 +515,16 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
   let HasExtVOP3DPP = 0;
 }
 
+def IsPow2Plus1: PatLeaf<(i32 imm), [{ // Leaf predicate on an i32 immediate: accepts V when isPowerOf2_32(V - 1) holds.
+  uint32_t V = N->getZExtValue(); // Immediate held as an unsigned 32-bit value.
+  return isPowerOf2_32(V - 1); // Unsigned arithmetic: V == 0 wraps to 0xFFFFFFFF before the check.
+}]>;
+
+def Log2_32: SDNodeXForm<imm, [{ // Node transform: turns an immediate V into an i32 target constant holding Log2_32(V - 1).
+  uint32_t V = N->getZExtValue(); // Immediate held as an unsigned 32-bit value.
+  return CurDAG->getTargetConstant(Log2_32(V - 1), SDLoc(N), MVT::i32); // NOTE(review): this record shares its name with the C++ helper called here - confirm that is intentional.
+}]>;
+
 let SubtargetPredicate = isGFX9Plus in {
 let isCommutable = 1, isReMaterializable = 1 in {
   defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -612,6 +622,10 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
 def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
 def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
 
+def : GCNPat< // Selects a DivergentBinFrag-wrapped i32 mul by an IsPow2Plus1 immediate as V_LSHL_ADD_U32_e64.
+ (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1), // Matched form: $src1 must satisfy the IsPow2Plus1 leaf.
+ (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>; // Operands: $src0, Log2_32 xform of $src1, $src0 - presumably ($src0 << log2($src1 - 1)) + $src0; TODO confirm operand order.
+
 let SubtargetPredicate = isGFX940Plus in
 def : GCNPat<
   (ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 028f32844576f6d..faba4c73e05697f 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-MUBUF,MUBUF %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca,+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s
@@ -7,44 +8,91 @@
 ; give an index relative to the scratch wave offset register
 
 ; Materialize into a mov. Make sure there isn't an unnecessary copy.
-; GCN-LABEL: {{^}}func_mov_fi_i32:
-; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 
-; CI-NEXT: v_lshr_b32_e64 v0, s32, 6
-; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-
-; GFX9-FLATSCR:     v_mov_b32_e32 v0, s32
-; GFX9-FLATSCR-NOT: v_lshrrev_b32_e64
-
-; MUBUF-NOT: v_mov
-
-; GCN: ds_write_b32 v0, v0
 define void @func_mov_fi_i32() #0 {
+; CI-LABEL: func_mov_fi_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_mov_fi_i32:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_mov_fi_i32:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_mov_fi_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s32
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
   store volatile ptr addrspace(5) %alloca, ptr addrspace(3) undef
   ret void
 }
 
 ; Offset due to different objects
-; GCN-LABEL: {{^}}func_mov_fi_i32_offset:
-; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-
-; CI-DAG: v_lshr_b32_e64 v0, s32, 6
-; CI-NOT: v_mov
-; CI: ds_write_b32 v0, v0
-; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6
-; CI-NEXT: v_add_i32_e{{32|64}} v0, {{s\[[0-9]+:[0-9]+\]|vcc}}, 4, [[SCALED]]
-; CI-NEXT: ds_write_b32 v0, v0
-
-; GFX9-MUBUF-NEXT:   v_lshrrev_b32_e64 v0, 6, s32
-; GFX9-FLATSCR:      v_mov_b32_e32 v0, s32
-; GFX9-FLATSCR:      s_add_i32 [[ADD:[^,]+]], s32, 4
-; GFX9-NEXT:         ds_write_b32 v0, v0
-; GFX9-MUBUF-NEXT:   v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
-; GFX9-MUBUF-NEXT:   v_add_u32_e32 v0, 4, [[SCALED]]
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, [[ADD]]
-; GFX9-NEXT:         ds_write_b32 v0, v0
 define void @func_mov_fi_i32_offset() #0 {
+; CI-LABEL: func_mov_fi_i32_offset:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_mov_fi_i32_offset:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_mov_fi_i32_offset:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-FLATSCR-NEXT:    s_add_i32 s0, s32, 4
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_mov_fi_i32_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s0, s32, 4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    ds_store_b32 v0, v1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca i32, addrspace(5)
   %alloca1 = alloca i32, addrspace(5)
   store volatile ptr addrspace(5) %alloca0, ptr addrspace(3) undef
@@ -55,21 +103,42 @@ define void @func_mov_fi_i32_offset() #0 {
 ; Materialize into an add of a constant offset from the FI.
 ; FIXME: Should be able to merge adds
 
-; GCN-LABEL: {{^}}func_add_constant_to_fi_i32:
-; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-
-; CI: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6
-; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]]
-
-; GFX9-MUBUF:       v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
-; GFX9-MUBUF-NEXT:  v_add_u32_e32 v0, 4, [[SCALED]]
-
-; GFX9-FLATSCR:      v_mov_b32_e32 [[ADD:v[0-9]+]], s32
-; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 4, [[ADD]]
-
-; GCN-NOT: v_mov
-; GCN: ds_write_b32 v0, v0
 define void @func_add_constant_to_fi_i32() #0 {
+; CI-LABEL: func_add_constant_to_fi_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_add_constant_to_fi_i32:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_add_constant_to_fi_i32:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-FLATSCR-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_add_constant_to_fi_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e64 v0, 4, s32
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca [2 x i32], align 4, addrspace(5)
   %gep0 = getelementptr inbounds [2 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
   store volatile ptr addrspace(5) %gep0, ptr addrspace(3) undef
@@ -81,17 +150,42 @@ define void @func_add_constant_to_fi_i32() #0 {
 ; FIXME: Should use s_mul but the frame index always gets materialized into a
 ; vgpr
 
-; GCN-LABEL: {{^}}func_other_fi_user_i32:
-
-; CI: v_lshr_b32_e64 v0, s32, 6
-
-; GFX9-MUBUF:   v_lshrrev_b32_e64 v0, 6, s32
-; GFX9-FLATSCR: v_mov_b32_e32 v0, s32
-
-; GCN-NEXT: v_mul_lo_u32 v0, v0, 9
-; GCN-NOT: v_mov
-; GCN: ds_write_b32 v0, v0
 define void @func_other_fi_user_i32() #0 {
+; CI-LABEL: func_other_fi_user_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    v_mul_lo_u32 v0, v0, 9
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_other_fi_user_i32:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_mul_lo_u32 v0, v0, 9
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_other_fi_user_i32:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-FLATSCR-NEXT:    v_mul_lo_u32 v0, v0, 9
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_other_fi_user_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mul_lo_u32 v0, s32, 9
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca [2 x i32], align 4, addrspace(5)
   %ptrtoint = ptrtoint ptr addrspace(5) %alloca to i32
   %mul = mul i32 %ptrtoint, 9
@@ -99,39 +193,110 @@ define void @func_other_fi_user_i32() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr:
-; GCN: v_mov_b32_e32 v1, 15{{$}}
-; MUBUF:        buffer_store_dword v1, v0, s[0:3], 0 offen{{$}}
-; GFX9-FLATSCR: scratch_store_dword v0, v1, off{{$}}
 define void @func_store_private_arg_i32_ptr(ptr addrspace(5) %ptr) #0 {
+; CI-LABEL: func_store_private_arg_i32_ptr:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v1, 15
+; CI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_store_private_arg_i32_ptr:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v1, 15
+; GFX9-MUBUF-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_store_private_arg_i32_ptr:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 15
+; GFX9-FLATSCR-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_store_private_arg_i32_ptr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, 15
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile i32 15, ptr addrspace(5) %ptr
   ret void
 }
 
-; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr:
-; GCN: s_waitcnt
-; MUBUF-NEXT:        buffer_load_dword v0, v0, s[0:3], 0 offen glc{{$}}
-; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off glc{{$}}
 define void @func_load_private_arg_i32_ptr(ptr addrspace(5) %ptr) #0 {
+; CI-LABEL: func_load_private_arg_i32_ptr:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_load_private_arg_i32_ptr:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_load_private_arg_i32_ptr:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_load_private_arg_i32_ptr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_load_b32 v0, v0, off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load volatile i32, ptr addrspace(5) %ptr
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr:
-; GCN: s_waitcnt
-
-; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6
-; CI-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]]
-
-; GFX9-MUBUF:      v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32
-; GFX9-MUBUF-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]]
-
-; GFX9-FLATSCR:      v_mov_b32_e32 [[SP:v[0-9]+]], s32
-; GFX9-FLATSCR-NEXT: v_or_b32_e32 v0, 4, [[SP]]
-
-; GCN-NOT: v_mov
-; GCN: ds_write_b32 v0, v0
 define void @void_func_byval_struct_i8_i32_ptr(ptr addrspace(5) byval({ i8, i32 }) %arg0) #0 {
+; CI-LABEL: void_func_byval_struct_i8_i32_ptr:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    v_or_b32_e32 v0, 4, v0
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: void_func_byval_struct_i8_i32_ptr:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_or_b32_e32 v0, 4, v0
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: void_func_byval_struct_i8_i32_ptr:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-FLATSCR-NEXT:    v_or_b32_e32 v0, 4, v0
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_byval_struct_i8_i32_ptr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_or_b32_e64 v0, 4, s32
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 0
   %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 1
   %load1 = load i32, ptr addrspace(5) %gep1
@@ -139,13 +304,57 @@ define void @void_func_byval_struct_i8_i32_ptr(ptr addrspace(5) byval({ i8, i32
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_value:
-; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MUBUF-NEXT: buffer_load_ubyte v0, off, s[0:3], s32
-; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; GFX9-FLATSCR-NEXT: scratch_load_ubyte v0, off, s32
-; GFX9-FLATSCR-NEXT: scratch_load_dword v1, off, s32 offset:4
+
 define void @void_func_byval_struct_i8_i32_ptr_value(ptr addrspace(5) byval({ i8, i32 }) %arg0) #0 {
+; CI-LABEL: void_func_byval_struct_i8_i32_ptr_value:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], s32
+; CI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    ds_write_b8 v0, v0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    ds_write_b32 v0, v1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: void_func_byval_struct_i8_i32_ptr_value:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    buffer_load_ubyte v0, off, s[0:3], s32
+; GFX9-MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-MUBUF-NEXT:    ds_write_b8 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v1
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: void_func_byval_struct_i8_i32_ptr_value:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    scratch_load_ubyte v0, off, s32
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v1, off, s32 offset:4
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT:    ds_write_b8 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v1
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_byval_struct_i8_i32_ptr_value:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_u8 v0, off, s32
+; GFX11-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    ds_store_b8 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ds_store_b32 v0, v1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 0
   %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 1
   %load0 = load i8, ptr addrspace(5) %gep0
@@ -155,24 +364,75 @@ define void @void_func_byval_struct_i8_i32_ptr_value(ptr addrspace(5) byval({ i8
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
-
-; GCN: s_and_saveexec_b64
 
-; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}}
-; GFX9-MUBUF:   buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}}
-; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 glc{{$}}
-
-; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6
-; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]]
-
-; GFX9-MUBUF:   v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32
-; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32
-
-; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]]
-
-; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]]
 define void @void_func_byval_struct_i8_i32_ptr_nonentry_block(ptr addrspace(5) byval({ i8, i32 }) %arg0, i32 %arg2) #0 {
+; CI-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; CI-NEXT:    s_cbranch_execz .LBB8_2
+; CI-NEXT:  ; %bb.1: ; %bb
+; CI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:  .LBB8_2: ; %ret
+; CI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-MUBUF-NEXT:    s_cbranch_execz .LBB8_2
+; GFX9-MUBUF-NEXT:  ; %bb.1: ; %bb
+; GFX9-MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:  .LBB8_2: ; %ret
+; GFX9-MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-FLATSCR-NEXT:    s_cbranch_execz .LBB8_2
+; GFX9-FLATSCR-NEXT:  ; %bb.1: ; %bb
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-FLATSCR-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:  .LBB8_2: ; %ret
+; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT:    s_cbranch_execz .LBB8_2
+; GFX11-NEXT:  ; %bb.1: ; %bb
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e64 v0, 4, s32
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:  .LBB8_2: ; %ret
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %arg2, 0
   br i1 %cmp, label %bb, label %ret
 
@@ -187,22 +447,59 @@ ret:
   ret void
 }
 
-; Added offset can't be used with VOP3 add
-; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32:
-
-; CI-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200
-; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6
-; CI: v_add_i32_e32 [[VZ:v[0-9]+]], vcc, [[K]], [[SCALED]]
-
-; GFX9-MUBUF-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
-; GFX9-MUBUF:     v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]]
-
-; GFX9-FLATSCR-DAG: s_add_i32 [[SZ:[^,]+]], s32, 0x200
-; GFX9-FLATSCR:     v_mov_b32_e32 [[VZ:v[0-9]+]], [[SZ]]
-
-; GCN: v_mul_lo_u32 [[VZ]], [[VZ]], 9
-; GCN: ds_write_b32 v0, [[VZ]]
 define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
+; CI-LABEL: func_other_fi_user_non_inline_imm_offset_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    s_movk_i32 vcc_lo, 0x200
+; CI-NEXT:    v_add_i32_e32 v0, vcc, vcc_lo, v0
+; CI-NEXT:    v_mul_lo_u32 v0, v0, 9
+; CI-NEXT:    v_mov_b32_e32 v1, 7
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_other_fi_user_non_inline_imm_offset_i32:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 0x200, v0
+; GFX9-MUBUF-NEXT:    v_mul_lo_u32 v0, v0, 9
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v1, 7
+; GFX9-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_other_fi_user_non_inline_imm_offset_i32:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_add_i32 s0, s32, 0x200
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-FLATSCR-NEXT:    v_mul_lo_u32 v0, v0, 9
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 7
+; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v1, s32 offset:260
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_other_fi_user_non_inline_imm_offset_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x200
+; GFX11-NEXT:    v_mov_b32_e32 v1, 7
+; GFX11-NEXT:    v_mul_lo_u32 v0, s0, 9
+; GFX11-NEXT:    scratch_store_b32 off, v1, s32 offset:260 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca [128 x i32], align 4, addrspace(5)
   %alloca1 = alloca [8 x i32], align 4, addrspace(5)
   %gep0 = getelementptr inbounds [128 x i32], ptr addrspace(5) %alloca0, i32 0, i32 65
@@ -213,21 +510,83 @@ define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
-
-; CI-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200
-; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6
-; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]]
-
-; GFX9-MUBUF-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
-; GFX9-MUBUF:     v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]]
-
-; GFX9-FLATSCR-DAG: s_add_i32 [[SZ:[^,]+]], s32, 0x200
-; GFX9-FLATSCR:     v_mov_b32_e32 [[VZ:v[0-9]+]], [[SZ]]
-
-; GCN: v_mul_lo_u32 [[VZ]], [[VZ]], 9
-; GCN: ds_write_b32 v0, [[VZ]]
 define void @func_other_fi_user_non_inline_imm_offset_i32_vcc_live() #0 {
+; CI-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, 7
+; CI-NEXT:    ;;#ASMSTART
+; CI-NEXT:    ; def vcc
+; CI-NEXT:    ;;#ASMEND
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:260
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    s_movk_i32 s4, 0x200
+; CI-NEXT:    v_add_i32_e64 v0, s[4:5], s4, v0
+; CI-NEXT:    v_mul_lo_u32 v0, v0, 9
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ;;#ASMSTART
+; CI-NEXT:    ; use vcc
+; CI-NEXT:    ;;#ASMEND
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v0, 7
+; GFX9-MUBUF-NEXT:    ;;#ASMSTART
+; GFX9-MUBUF-NEXT:    ; def vcc
+; GFX9-MUBUF-NEXT:    ;;#ASMEND
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:260
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 0x200, v0
+; GFX9-MUBUF-NEXT:    v_mul_lo_u32 v0, v0, 9
+; GFX9-MUBUF-NEXT:    ;;#ASMSTART
+; GFX9-MUBUF-NEXT:    ; use vcc
+; GFX9-MUBUF-NEXT:    ;;#ASMEND
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, 7
+; GFX9-FLATSCR-NEXT:    s_add_i32 s0, s32, 0x200
+; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
+; GFX9-FLATSCR-NEXT:    ; def vcc
+; GFX9-FLATSCR-NEXT:    ;;#ASMEND
+; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:260
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-FLATSCR-NEXT:    v_mul_lo_u32 v0, v0, 9
+; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
+; GFX9-FLATSCR-NEXT:    ; use vcc
+; GFX9-FLATSCR-NEXT:    ;;#ASMEND
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x200
+; GFX11-NEXT:    v_mov_b32_e32 v1, 7
+; GFX11-NEXT:    v_mul_lo_u32 v0, s0, 9
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; def vcc
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    scratch_store_b32 off, v1, s32 offset:260 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use vcc
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca [128 x i32], align 4, addrspace(5)
   %alloca1 = alloca [8 x i32], align 4, addrspace(5)
   %vcc = call i64 asm sideeffect "; def $0", "={vcc}"()
@@ -245,17 +604,392 @@ declare void @func(ptr addrspace(5) nocapture) #0
 ; undef flag not preserved in eliminateFrameIndex when handling the
 ; stores in the middle block.
 
-; GCN-LABEL: {{^}}undefined_stack_store_reg:
-; GCN: s_and_saveexec_b64
-; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset:
-; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset:
-; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset:
-; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:
-; FLATSCR: scratch_store_dword v0, off, s33 offset:
-; FLATSCR: scratch_store_dword v0, off, s33 offset:
-; FLATSCR: scratch_store_dword v0, off, s33 offset:
-; FLATSCR: scratch_store_dword v{{[0-9]+}}, off, s33 offset:
 define void @undefined_stack_store_reg(float %arg, i32 %arg1) #0 {
+; CI-LABEL: undefined_stack_store_reg:
+; CI:       ; %bb.0: ; %bb
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s16, s33
+; CI-NEXT:    s_mov_b32 s33, s32
+; CI-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; CI-NEXT:    s_mov_b64 exec, s[18:19]
+; CI-NEXT:    v_writelane_b32 v40, s16, 18
+; CI-NEXT:    v_writelane_b32 v40, s30, 0
+; CI-NEXT:    v_writelane_b32 v40, s31, 1
+; CI-NEXT:    v_writelane_b32 v40, s34, 2
+; CI-NEXT:    v_writelane_b32 v40, s35, 3
+; CI-NEXT:    v_writelane_b32 v40, s36, 4
+; CI-NEXT:    v_writelane_b32 v40, s37, 5
+; CI-NEXT:    v_writelane_b32 v40, s38, 6
+; CI-NEXT:    v_writelane_b32 v40, s39, 7
+; CI-NEXT:    v_writelane_b32 v40, s40, 8
+; CI-NEXT:    v_writelane_b32 v40, s41, 9
+; CI-NEXT:    v_writelane_b32 v40, s42, 10
+; CI-NEXT:    v_writelane_b32 v40, s43, 11
+; CI-NEXT:    v_writelane_b32 v40, s44, 12
+; CI-NEXT:    v_writelane_b32 v40, s45, 13
+; CI-NEXT:    v_writelane_b32 v40, s46, 14
+; CI-NEXT:    v_writelane_b32 v40, s47, 15
+; CI-NEXT:    v_writelane_b32 v40, s48, 16
+; CI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CI-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; CI-NEXT:    v_writelane_b32 v40, s49, 17
+; CI-NEXT:    v_mov_b32_e32 v41, v0
+; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; CI-NEXT:    s_addk_i32 s32, 0xc00
+; CI-NEXT:    buffer_store_dword v0, v0, s[0:3], 0 offen
+; CI-NEXT:    s_and_saveexec_b64 s[46:47], vcc
+; CI-NEXT:    s_cbranch_execz .LBB11_2
+; CI-NEXT:  ; %bb.1: ; %bb4
+; CI-NEXT:    s_getpc_b64 s[16:17]
+; CI-NEXT:    s_add_u32 s16, s16, func@gotpcrel32@lo+4
+; CI-NEXT:    s_addc_u32 s17, s17, func@gotpcrel32@hi+12
+; CI-NEXT:    s_load_dwordx2 s[48:49], s[16:17], 0x0
+; CI-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; CI-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; CI-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; CI-NEXT:    s_mov_b64 s[40:41], s[10:11]
+; CI-NEXT:    s_mov_b32 s42, s12
+; CI-NEXT:    s_mov_b32 s43, s13
+; CI-NEXT:    s_mov_b32 s44, s14
+; CI-NEXT:    s_mov_b32 s45, s15
+; CI-NEXT:    v_mov_b32_e32 v42, v31
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:28
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:24
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:20
+; CI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16
+; CI-NEXT:    v_lshr_b32_e64 v0, s33, 6
+; CI-NEXT:    s_mov_b64 s[4:5], s[34:35]
+; CI-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; CI-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; CI-NEXT:    s_mov_b64 s[10:11], s[40:41]
+; CI-NEXT:    s_mov_b32 s12, s42
+; CI-NEXT:    s_mov_b32 s13, s43
+; CI-NEXT:    s_mov_b32 s14, s44
+; CI-NEXT:    s_mov_b32 s15, s45
+; CI-NEXT:    v_mov_b32_e32 v31, v42
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
+; CI-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; CI-NEXT:  .LBB11_2: ; %bb5
+; CI-NEXT:    s_or_b64 exec, exec, s[46:47]
+; CI-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; CI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CI-NEXT:    v_readlane_b32 s49, v40, 17
+; CI-NEXT:    v_readlane_b32 s48, v40, 16
+; CI-NEXT:    v_readlane_b32 s47, v40, 15
+; CI-NEXT:    v_readlane_b32 s46, v40, 14
+; CI-NEXT:    v_readlane_b32 s45, v40, 13
+; CI-NEXT:    v_readlane_b32 s44, v40, 12
+; CI-NEXT:    v_readlane_b32 s43, v40, 11
+; CI-NEXT:    v_readlane_b32 s42, v40, 10
+; CI-NEXT:    v_readlane_b32 s41, v40, 9
+; CI-NEXT:    v_readlane_b32 s40, v40, 8
+; CI-NEXT:    v_readlane_b32 s39, v40, 7
+; CI-NEXT:    v_readlane_b32 s38, v40, 6
+; CI-NEXT:    v_readlane_b32 s37, v40, 5
+; CI-NEXT:    v_readlane_b32 s36, v40, 4
+; CI-NEXT:    v_readlane_b32 s35, v40, 3
+; CI-NEXT:    v_readlane_b32 s34, v40, 2
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
+; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s4, v40, 18
+; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; CI-NEXT:    s_mov_b64 exec, s[6:7]
+; CI-NEXT:    s_addk_i32 s32, 0xf400
+; CI-NEXT:    s_mov_b32 s33, s4
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: undefined_stack_store_reg:
+; GFX9-MUBUF:       ; %bb.0: ; %bb
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_mov_b32 s16, s33
+; GFX9-MUBUF-NEXT:    s_mov_b32 s33, s32
+; GFX9-MUBUF-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX9-MUBUF-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s16, 18
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s35, 3
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s36, 4
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s37, 5
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s38, 6
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s39, 7
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s40, 8
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s41, 9
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s42, 10
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s43, 11
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s44, 12
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s45, 13
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s46, 14
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s47, 15
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s48, 16
+; GFX9-MUBUF-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-MUBUF-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s49, 17
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v41, v0
+; GFX9-MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-MUBUF-NEXT:    s_addk_i32 s32, 0xc00
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT:    s_and_saveexec_b64 s[46:47], vcc
+; GFX9-MUBUF-NEXT:    s_cbranch_execz .LBB11_2
+; GFX9-MUBUF-NEXT:  ; %bb.1: ; %bb4
+; GFX9-MUBUF-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-MUBUF-NEXT:    s_add_u32 s16, s16, func@gotpcrel32@lo+4
+; GFX9-MUBUF-NEXT:    s_addc_u32 s17, s17, func@gotpcrel32@hi+12
+; GFX9-MUBUF-NEXT:    s_load_dwordx2 s[48:49], s[16:17], 0x0
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[40:41], s[10:11]
+; GFX9-MUBUF-NEXT:    s_mov_b32 s42, s12
+; GFX9-MUBUF-NEXT:    s_mov_b32 s43, s13
+; GFX9-MUBUF-NEXT:    s_mov_b32 s44, s14
+; GFX9-MUBUF-NEXT:    s_mov_b32 s45, s15
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v42, v31
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:28
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:24
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:20
+; GFX9-MUBUF-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[4:5], s[34:35]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[10:11], s[40:41]
+; GFX9-MUBUF-NEXT:    s_mov_b32 s12, s42
+; GFX9-MUBUF-NEXT:    s_mov_b32 s13, s43
+; GFX9-MUBUF-NEXT:    s_mov_b32 s14, s44
+; GFX9-MUBUF-NEXT:    s_mov_b32 s15, s45
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v31, v42
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 16, v0
+; GFX9-MUBUF-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; GFX9-MUBUF-NEXT:  .LBB11_2: ; %bb5
+; GFX9-MUBUF-NEXT:    s_or_b64 exec, exec, s[46:47]
+; GFX9-MUBUF-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-MUBUF-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s49, v40, 17
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s48, v40, 16
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s47, v40, 15
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s46, v40, 14
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s45, v40, 13
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s44, v40, 12
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s43, v40, 11
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s42, v40, 10
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s41, v40, 9
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s40, v40, 8
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s39, v40, 7
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s38, v40, 6
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s37, v40, 5
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s36, v40, 4
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s35, v40, 3
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s4, v40, 18
+; GFX9-MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GFX9-MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-MUBUF-NEXT:    s_addk_i32 s32, 0xf400
+; GFX9-MUBUF-NEXT:    s_mov_b32 s33, s4
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: undefined_stack_store_reg:
+; GFX9-FLATSCR:       ; %bb.0: ; %bb
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s0, s33
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s33, s32
+; GFX9-FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v40, s33 offset:32 ; 4-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s0, 18
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s35, 3
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s36, 4
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s37, 5
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s38, 6
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s39, 7
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s40, 8
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s41, 9
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s42, 10
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s43, 11
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s44, 12
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s45, 13
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s46, 14
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s47, 15
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s48, 16
+; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v42, s33 ; 4-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s49, 17
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v41, v0
+; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-FLATSCR-NEXT:    s_add_i32 s32, s32, 48
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[41:44], s0
+; GFX9-FLATSCR-NEXT:    s_and_saveexec_b64 s[46:47], vcc
+; GFX9-FLATSCR-NEXT:    s_cbranch_execz .LBB11_2
+; GFX9-FLATSCR-NEXT:  ; %bb.1: ; %bb4
+; GFX9-FLATSCR-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-FLATSCR-NEXT:    s_add_u32 s0, s0, func@gotpcrel32@lo+4
+; GFX9-FLATSCR-NEXT:    s_addc_u32 s1, s1, func@gotpcrel32@hi+12
+; GFX9-FLATSCR-NEXT:    s_load_dwordx2 s[48:49], s[0:1], 0x0
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[40:41], s[10:11]
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s42, s12
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s43, s13
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s44, s14
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s45, s15
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v42, v31
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; GFX9-FLATSCR-NEXT:    s_add_i32 s0, s33, 16
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[4:5], s[34:35]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[10:11], s[40:41]
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s12, s42
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s13, s43
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s14, s44
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s15, s45
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v31, v42
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[41:44], s33 offset:16
+; GFX9-FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; GFX9-FLATSCR-NEXT:  .LBB11_2: ; %bb5
+; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[46:47]
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v42, off, s33 ; 4-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v41, off, s33 offset:4 ; 4-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s49, v40, 17
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s48, v40, 16
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s47, v40, 15
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s46, v40, 14
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s45, v40, 13
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s44, v40, 12
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s43, v40, 11
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s42, v40, 10
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s41, v40, 9
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s40, v40, 8
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s39, v40, 7
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s38, v40, 6
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s37, v40, 5
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s36, v40, 4
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s35, v40, 3
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s0, v40, 18
+; GFX9-FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v40, off, s33 offset:32 ; 4-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9-FLATSCR-NEXT:    s_addk_i32 s32, 0xffd0
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s33, s0
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: undefined_stack_store_reg:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:32 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 17
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33
+; GFX11-NEXT:    v_mov_b32_e32 v41, v0
+; GFX11-NEXT:    s_add_i32 s32, s32, 48
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    scratch_store_b128 off, v[41:44], s0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s35, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s36, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s37, 5
+; GFX11-NEXT:    v_writelane_b32 v40, s38, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s39, 7
+; GFX11-NEXT:    v_writelane_b32 v40, s40, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s41, 9
+; GFX11-NEXT:    v_writelane_b32 v40, s42, 10
+; GFX11-NEXT:    v_writelane_b32 v40, s43, 11
+; GFX11-NEXT:    v_writelane_b32 v40, s44, 12
+; GFX11-NEXT:    v_writelane_b32 v40, s45, 13
+; GFX11-NEXT:    v_writelane_b32 v40, s46, 14
+; GFX11-NEXT:    s_mov_b32 s46, exec_lo
+; GFX11-NEXT:    v_writelane_b32 v40, s48, 15
+; GFX11-NEXT:    v_writelane_b32 v40, s49, 16
+; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX11-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-NEXT:  ; %bb.1: ; %bb4
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, func@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, func@gotpcrel32@hi+12
+; GFX11-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX11-NEXT:    s_load_b64 s[48:49], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; GFX11-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GFX11-NEXT:    s_mov_b64 s[40:41], s[10:11]
+; GFX11-NEXT:    s_mov_b32 s42, s12
+; GFX11-NEXT:    v_mov_b32_e32 v42, v31
+; GFX11-NEXT:    s_mov_b32 s43, s13
+; GFX11-NEXT:    s_mov_b32 s44, s14
+; GFX11-NEXT:    s_mov_b32 s45, s15
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; GFX11-NEXT:    s_add_i32 s0, s33, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT:    s_mov_b64 s[4:5], s[34:35]
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GFX11-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; GFX11-NEXT:    s_mov_b64 s[10:11], s[40:41]
+; GFX11-NEXT:    s_mov_b32 s12, s42
+; GFX11-NEXT:    s_mov_b32 s13, s43
+; GFX11-NEXT:    s_mov_b32 s14, s44
+; GFX11-NEXT:    s_mov_b32 s15, s45
+; GFX11-NEXT:    scratch_store_b128 off, v[41:44], s33 offset:16
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; GFX11-NEXT:  .LBB11_2: ; %bb5
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s46
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
+; GFX11-NEXT:    v_readlane_b32 s49, v40, 16
+; GFX11-NEXT:    v_readlane_b32 s48, v40, 15
+; GFX11-NEXT:    v_readlane_b32 s46, v40, 14
+; GFX11-NEXT:    v_readlane_b32 s45, v40, 13
+; GFX11-NEXT:    v_readlane_b32 s44, v40, 12
+; GFX11-NEXT:    v_readlane_b32 s43, v40, 11
+; GFX11-NEXT:    v_readlane_b32 s42, v40, 10
+; GFX11-NEXT:    v_readlane_b32 s41, v40, 9
+; GFX11-NEXT:    v_readlane_b32 s40, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s39, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s38, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s37, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s36, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s35, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 17
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:32 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_addk_i32 s32, 0xffd0
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = alloca <4 x float>, align 16, addrspace(5)
   %tmp2 = insertelement <4 x float> undef, float %arg, i32 0
@@ -273,22 +1007,74 @@ bb5:
   ret void
 }
 
-; GCN-LABEL: {{^}}alloca_ptr_nonentry_block:
-; GCN: s_and_saveexec_b64
-; MUBUF:   buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
-; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4
-
-; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6
-; CI-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]]
-
-; GFX9-MUBUF: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32
-; GFX9-MUBUF-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]]
-
-; GFX9-FLATSCR:      v_mov_b32_e32 [[SP:v[0-9]+]], s32
-; GFX9-FLATSCR-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SP]]
-
-; GCN: ds_write_b32 v{{[0-9]+}}, [[PTR]]
 define void @alloca_ptr_nonentry_block(i32 %arg0) #0 {
+; CI-LABEL: alloca_ptr_nonentry_block:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; CI-NEXT:    s_cbranch_execz .LBB12_2
+; CI-NEXT:  ; %bb.1: ; %bb
+; CI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    v_or_b32_e32 v0, 4, v0
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:  .LBB12_2: ; %ret
+; CI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: alloca_ptr_nonentry_block:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-MUBUF-NEXT:    s_cbranch_execz .LBB12_2
+; GFX9-MUBUF-NEXT:  ; %bb.1: ; %bb
+; GFX9-MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_or_b32_e32 v0, 4, v0
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:  .LBB12_2: ; %ret
+; GFX9-MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: alloca_ptr_nonentry_block:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-FLATSCR-NEXT:    s_cbranch_execz .LBB12_2
+; GFX9-FLATSCR-NEXT:  ; %bb.1: ; %bb
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-FLATSCR-NEXT:    v_or_b32_e32 v0, 4, v0
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:  .LBB12_2: ; %ret
+; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: alloca_ptr_nonentry_block:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT:    s_cbranch_execz .LBB12_2
+; GFX11-NEXT:  ; %bb.1: ; %bb
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_or_b32_e64 v0, 4, s32
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:  .LBB12_2: ; %ret
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca { i8, i32 }, align 8, addrspace(5)
   %cmp = icmp eq i32 %arg0, 0
   br i1 %cmp, label %bb, label %ret
@@ -308,14 +1094,67 @@ ret:
 %type.i16 = type { i16 }
 @_ZZN0 = external hidden addrspace(3) global %struct0, align 8
 
+define protected amdgpu_kernel void @tied_operand_test(i1 %c1, i1 %c2, i32 %val) {
+; CI-LABEL: tied_operand_test:
+; CI:       ; %bb.0: ; %entry
+; CI-NEXT:    s_add_u32 s0, s0, s9
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4
+; CI-NEXT:    s_load_dword s4, s[4:5], 0x1
+; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_lshl_b32 s4, s4, 1
+; CI-NEXT:    v_mov_b32_e32 v2, s4
+; CI-NEXT:    ds_write_b16 v2, v1 offset:8
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    ds_write_b16 v2, v0 offset:10
+; CI-NEXT:    s_endpgm
+;
+; GFX9-MUBUF-LABEL: tied_operand_test:
+; GFX9-MUBUF:       ; %bb.0: ; %entry
+; GFX9-MUBUF-NEXT:    s_add_u32 s0, s0, s9
+; GFX9-MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4
+; GFX9-MUBUF-NEXT:    s_load_dword s4, s[4:5], 0x4
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-MUBUF-NEXT:    ds_write_b16 v1, v2 offset:8
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    ds_write_b16 v1, v0 offset:10
+; GFX9-MUBUF-NEXT:    s_endpgm
+;
+; GFX9-FLATSCR-LABEL: tied_operand_test:
+; GFX9-FLATSCR:       ; %bb.0: ; %entry
+; GFX9-FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
+; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s2, 0
+; GFX9-FLATSCR-NEXT:    scratch_load_ushort v0, off, s2 offset:4
+; GFX9-FLATSCR-NEXT:    s_load_dword s0, s[0:1], 0x4
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-FLATSCR-NEXT:    ds_write_b16 v1, v2 offset:8
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    ds_write_b16 v1, v0 offset:10
+; GFX9-FLATSCR-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: tied_operand_test:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-DAG:     scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off offset:4
-; GFX11-DAG:     v_mov_b32_e32 [[C:v[0-9]+]], 0x7b
-; GFX11-DAG:     ds_store_b16 v{{[0-9]+}}, [[LDRESULT]]  offset:10
-; GFX11-DAG:     ds_store_b16 v{{[0-9]+}}, [[C]]  offset:8
+; GFX11-NEXT:    scratch_load_u16 v0, off, off offset:4
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x4
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    ds_store_b16 v1, v2 offset:8
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ds_store_b16 v1, v0 offset:10
 ; GFX11-NEXT:    s_endpgm
-define protected amdgpu_kernel void @tied_operand_test(i1 %c1, i1 %c2, i32 %val) {
 entry:
   %scratch0 = alloca i16, align 4, addrspace(5)
   %scratch1 = alloca i16, align 4, addrspace(5)
@@ -333,3 +1172,7 @@ entry:
 }
 
 attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; GFX9: {{.*}}
+; MUBUF: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index b4e9376d8277737..350163633c2e951 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -2395,6 +2395,45 @@ entry:
   ret void
 }
 
+define i32 @mul_pow2_plus_1(i32 %val) {
+; SI-LABEL: mul_pow2_plus_1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_mul_lo_u32 v0, v0, 9
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: mul_pow2_plus_1:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_mul_lo_u32 v0, v0, 9
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: mul_pow2_plus_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: mul_pow2_plus_1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: mul_pow2_plus_1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: mul_pow2_plus_1:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+  %mul = mul i32 %val, 9
+  ret i32 %mul
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 930ba80ad69638d..d4e936e34a29150 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -141,9 +141,9 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
   ; SI-NEXT: bb.2.Flow:
   ; SI-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %10, %bb.4
-  ; SI-NEXT:   [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %33:vgpr_32, %bb.1, %9, %bb.4
-  ; SI-NEXT:   [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %35:vgpr_32, %bb.4
+  ; SI-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %31:vgpr_32, %bb.1, %10, %bb.4
+  ; SI-NEXT:   [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %9, %bb.4
+  ; SI-NEXT:   [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %34:vgpr_32, %bb.4
   ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.3
   ; SI-NEXT: {{  $}}
@@ -158,7 +158,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
   ; SI-NEXT:   successors: %bb.2(0x80000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY2]], 0, [[PHI1]], 0, 0, implicit $mode, implicit $exec
-  ; SI-NEXT:   [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[PHI1]], 3, implicit $exec
+  ; SI-NEXT:   [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 killed [[PHI1]], 1, [[PHI1]], implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.2
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.5.if.end:
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
index c71dc06c68d8d68..9183f043f052cb4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
@@ -92,20 +92,20 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
 ; SI-NEXT:  .LBB2_1: ; %if.end
 ; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
 ; SI-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; SI-NEXT:    v_add_nc_u32_e32 v2, 1, v0
+; SI-NEXT:    v_add_nc_u32_e32 v2, 1, v3
 ; SI-NEXT:    s_add_i32 s1, s1, 1
 ; SI-NEXT:    s_cmp_lt_i32 s1, s0
 ; SI-NEXT:    s_cbranch_scc0 .LBB2_6
 ; SI-NEXT:  .LBB2_2: ; %for.body
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:    s_and_saveexec_b32 s2, vcc_lo
 ; SI-NEXT:    s_xor_b32 s2, exec_lo, s2
 ; SI-NEXT:  ; %bb.3: ; %else
 ; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
-; SI-NEXT:    v_mul_lo_u32 v0, v2, 3
-; SI-NEXT:    v_mul_f32_e32 v3, v1, v2
+; SI-NEXT:    v_mul_f32_e32 v0, v1, v2
+; SI-NEXT:    v_lshl_add_u32 v3, v2, 1, v2
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:  ; %bb.4: ; %Flow
 ; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
@@ -113,11 +113,11 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
 ; SI-NEXT:    s_cbranch_execz .LBB2_1
 ; SI-NEXT:  ; %bb.5: ; %if
 ; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
-; SI-NEXT:    v_mul_f32_e32 v3, s1, v1
-; SI-NEXT:    v_add_nc_u32_e32 v0, 1, v2
+; SI-NEXT:    v_mul_f32_e32 v0, s1, v1
+; SI-NEXT:    v_add_nc_u32_e32 v3, 1, v2
 ; SI-NEXT:    s_branch .LBB2_1
 ; SI-NEXT:  .LBB2_6: ; %for.end
-; SI-NEXT:    v_add_f32_e32 v0, v0, v3
+; SI-NEXT:    v_add_f32_e32 v0, v3, v0
 ; SI-NEXT:    ; return to shader part epilog
 entry:
 ;  %break = icmp sgt i32 %bound, 0
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 95f947cbca14f05..6bb066f06dd9a24 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1536,7 +1536,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3
 ; GFX9-W64-NEXT:  ; %bb.2: ; %Flow
 ; GFX9-W64-NEXT:    s_andn2_saveexec_b64 s[14:15], s[14:15]
 ; GFX9-W64-NEXT:  ; %bb.3: ; %IF
-; GFX9-W64-NEXT:    v_mul_lo_u32 v0, v5, 3
+; GFX9-W64-NEXT:    v_lshl_add_u32 v0, v5, 1, v5
 ; GFX9-W64-NEXT:  ; %bb.4: ; %END
 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
@@ -1566,7 +1566,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3
 ; GFX10-W32-NEXT:  ; %bb.2: ; %Flow
 ; GFX10-W32-NEXT:    s_andn2_saveexec_b32 s13, s13
 ; GFX10-W32-NEXT:  ; %bb.3: ; %IF
-; GFX10-W32-NEXT:    v_mul_lo_u32 v0, v5, 3
+; GFX10-W32-NEXT:    v_lshl_add_u32 v0, v5, 1, v5
 ; GFX10-W32-NEXT:  ; %bb.4: ; %END
 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12



More information about the llvm-commits mailing list