[llvm] [AMDGPU][MISched] Allow memory ops of different base pointers to be clustered (PR #140674)
via llvm-commits
llvm-commits at lists.llvm.org
Wed May 21 17:12:10 PDT 2025
https://github.com/choikwa updated https://github.com/llvm/llvm-project/pull/140674
>From 584a6437a001e6bfe77e1be5432c5caad418f802 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Mon, 12 May 2025 18:52:02 -0500
Subject: [PATCH 1/6] [AMDGPU][MISched] Allow memory ops of different base
pointers to be clustered
This patch relaxes the same-base-pointer requirement for memory op clustering by only testing for an identical address space.
In testing, it has been observed that clustering memory ops with different base pointers can improve performance.
In particular, BabelStream's dot_kernel(double) performed up to 15% better when memory loads with different base pointers were clustered.
Internal CQE testing did not show significant regressions.
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 37 +-
.../CodeGen/AMDGPU/GlobalISel/add.vni16.ll | 228 +-
...licit-kernarg-backend-usage-global-isel.ll | 8 +-
.../GlobalISel/llvm.amdgcn.intersect_ray.ll | 6 +
.../AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll | 12 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll | 4 +
.../CodeGen/AMDGPU/GlobalISel/localizer.ll | 8 +-
.../AMDGPU/GlobalISel/mul-known-bits.i64.ll | 4 +
.../wmma-gfx12-w32-swmmac-index_key.ll | 26 +-
.../wmma-gfx12-w64-swmmac-index_key.ll | 11 +
llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 2 +
.../AMDGPU/agpr-copy-no-free-registers.ll | 50 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 17965 ++++++++--------
.../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 414 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 2596 ++-
.../test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll | 112 +-
.../test/CodeGen/AMDGPU/array-ptr-calc-i32.ll | 3 +-
...tor-flatscratchinit-undefined-behavior2.ll | 164 +-
llvm/test/CodeGen/AMDGPU/bf16.ll | 87 +-
.../CodeGen/AMDGPU/call-argument-types.ll | 11 +-
llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 4 +
llvm/test/CodeGen/AMDGPU/clamp-modifier.ll | 1 +
llvm/test/CodeGen/AMDGPU/clamp.ll | 1 +
llvm/test/CodeGen/AMDGPU/cluster_stores.ll | 1 +
.../AMDGPU/constant-address-space-32bit.ll | 977 +-
.../CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll | 27 +-
llvm/test/CodeGen/AMDGPU/ctpop16.ll | 12 +-
llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 17 +-
.../AMDGPU/divergence-driven-buildvector.ll | 2 +
llvm/test/CodeGen/AMDGPU/ds_read2.ll | 16 +-
llvm/test/CodeGen/AMDGPU/fcmp.f16.ll | 56 +
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 41 +-
llvm/test/CodeGen/AMDGPU/fma-combine.ll | 21 +-
llvm/test/CodeGen/AMDGPU/fmed3.ll | 3 +
llvm/test/CodeGen/AMDGPU/fmul.f16.ll | 92 +-
llvm/test/CodeGen/AMDGPU/frem.ll | 24 +-
llvm/test/CodeGen/AMDGPU/fsub.f16.ll | 29 +-
.../CodeGen/AMDGPU/function-args-inreg.ll | 84 +-
llvm/test/CodeGen/AMDGPU/function-args.ll | 158 +-
.../AMDGPU/gfx-callable-argument-types.ll | 4 +
.../AMDGPU/gfx-callable-return-types.ll | 116 +-
.../AMDGPU/global_atomics_scan_fadd.ll | 212 +-
.../AMDGPU/global_atomics_scan_fmax.ll | 102 +-
.../AMDGPU/global_atomics_scan_fmin.ll | 102 +-
.../AMDGPU/global_atomics_scan_fsub.ll | 212 +-
.../AMDGPU/group-image-instructions.ll | 3 +-
.../identical-subrange-spill-infloop.ll | 20 +-
llvm/test/CodeGen/AMDGPU/idot2.ll | 698 +-
llvm/test/CodeGen/AMDGPU/idot4s.ll | 697 +-
llvm/test/CodeGen/AMDGPU/idot4u.ll | 1228 +-
llvm/test/CodeGen/AMDGPU/idot8s.ll | 238 +-
llvm/test/CodeGen/AMDGPU/idot8u.ll | 203 +-
.../AMDGPU/implicit-kernarg-backend-usage.ll | 8 +-
.../AMDGPU/indirect-call-known-callees.ll | 1 +
.../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 1 +
.../issue130120-eliminate-frame-index.ll | 13 +-
llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 4 +
.../AMDGPU/llvm.amdgcn.bvh8_intersect_ray.ll | 4 +
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll | 11 +-
.../AMDGPU/llvm.amdgcn.dual_intersect_ray.ll | 4 +
.../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 4 +
.../AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll | 8 +
.../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 2 +
.../CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll | 5 +-
.../AMDGPU/llvm.amdgcn.intersect_ray.ll | 10 +
.../AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 15 +-
...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 34 +-
.../AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll | 20 +
.../CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll | 16 +-
.../llvm.amdgcn.struct.buffer.load.tfe.ll | 20 +
.../AMDGPU/llvm.amdgcn.waitcnt.out.order.ll | 2 +
.../CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll | 12 +-
.../CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll | 4 +
llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll | 88 +-
llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll | 52 +-
llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll | 78 +-
llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll | 166 +-
llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll | 78 +-
llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll | 166 +-
llvm/test/CodeGen/AMDGPU/load-select-ptr.ll | 8 +-
llvm/test/CodeGen/AMDGPU/max.i16.ll | 20 +-
llvm/test/CodeGen/AMDGPU/min.ll | 2 +
llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll | 7 +-
llvm/test/CodeGen/AMDGPU/mul.ll | 130 +-
llvm/test/CodeGen/AMDGPU/or.ll | 28 +-
llvm/test/CodeGen/AMDGPU/permute_i8.ll | 152 +-
.../AMDGPU/promote-constOffset-to-imm.ll | 123 +-
.../AMDGPU/reassoc-mul-add-1-to-mad.ll | 120 +-
llvm/test/CodeGen/AMDGPU/rotl.ll | 13 +-
llvm/test/CodeGen/AMDGPU/rotr.ll | 13 +-
llvm/test/CodeGen/AMDGPU/sdwa-commute.ll | 8 +-
llvm/test/CodeGen/AMDGPU/select.f16.ll | 310 +-
llvm/test/CodeGen/AMDGPU/sitofp.f16.ll | 62 +-
llvm/test/CodeGen/AMDGPU/sub.ll | 38 +-
llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 20 +-
llvm/test/CodeGen/AMDGPU/uitofp.f16.ll | 62 +-
llvm/test/CodeGen/AMDGPU/v_madak_f16.ll | 4 +
.../test/CodeGen/AMDGPU/vector-reduce-fadd.ll | 4 +-
.../test/CodeGen/AMDGPU/vector-reduce-fmul.ll | 4 +-
.../CodeGen/AMDGPU/vector_shuffle.packed.ll | 207 +-
llvm/test/CodeGen/AMDGPU/vselect.ll | 54 +-
.../AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll | 26 +-
.../AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll | 11 +
llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll | 66 +-
llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll | 22 +
llvm/test/CodeGen/AMDGPU/wqm.ll | 95 +-
llvm/test/CodeGen/AMDGPU/xor.ll | 64 +-
107 files changed, 15382 insertions(+), 14266 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 85276bd24bcf4..8b19ab35bc822 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -47,6 +47,12 @@ namespace llvm::AMDGPU {
#include "AMDGPUGenSearchableTables.inc"
} // namespace llvm::AMDGPU
+static cl::opt<bool> DisableDiffBasePtrMemClustering(
+ "amdgpu-disable-diff-baseptr-mem-clustering",
+ cl::desc("Disable clustering memory ops with different base pointers"),
+ cl::init(false),
+ cl::Hidden);
+
// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
@@ -522,6 +528,22 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
return false;
}
+static bool memOpsHaveSameAddrspace(const MachineInstr &MI1,
+ ArrayRef<const MachineOperand *> BaseOps1,
+ const MachineInstr &MI2,
+ ArrayRef<const MachineOperand *> BaseOps2) {
+ // If base is identical, assume identical addrspace
+ if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
+ return true;
+
+ if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
+ return false;
+
+ auto *MO1 = *MI1.memoperands_begin();
+ auto *MO2 = *MI2.memoperands_begin();
+ return MO1->getAddrSpace() == MO2->getAddrSpace();
+}
+
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
ArrayRef<const MachineOperand *> BaseOps1,
const MachineInstr &MI2,
@@ -559,14 +581,21 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
int64_t Offset2, bool OffsetIsScalable2,
unsigned ClusterSize,
unsigned NumBytes) const {
- // If the mem ops (to be clustered) do not have the same base ptr, then they
- // should not be clustered
unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
if (!BaseOps1.empty() && !BaseOps2.empty()) {
const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
- if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
- return false;
+
+ if (!DisableDiffBasePtrMemClustering) {
+ // Only consider memory ops from same addrspace for clustering
+ if (!memOpsHaveSameAddrspace(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
+ return false;
+ } else {
+ // If the mem ops (to be clustered) do not have the same base ptr, then they
+ // should not be clustered
+ if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
+ return false;
+ }
const SIMachineFunctionInfo *MFI =
FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
index 27b93872b9f1d..f562d958529d1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
@@ -8,31 +8,31 @@ define void @add_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v8, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v9, v[6:7]
-; GFX8-NEXT: flat_load_ushort v10, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v10, v[8:9]
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 2, v2
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_ushort v11, v[0:1]
+; GFX8-NEXT: flat_load_ushort v12, v[2:3]
+; GFX8-NEXT: flat_load_ushort v8, v[8:9]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_ushort v11, v[2:3]
-; GFX8-NEXT: flat_load_ushort v12, v[0:1]
; GFX8-NEXT: flat_load_ushort v6, v[6:7]
+; GFX8-NEXT: flat_load_ushort v7, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_add_u16_e32 v7, v8, v11
+; GFX8-NEXT: s_waitcnt vmcnt(3)
+; GFX8-NEXT: v_add_u16_e32 v9, v11, v12
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_add_u16_e32 v8, v9, v12
+; GFX8-NEXT: v_add_u16_e32 v6, v6, v8
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u16_e32 v6, v10, v6
-; GFX8-NEXT: flat_store_short v[4:5], v7
-; GFX8-NEXT: flat_store_short v[0:1], v8
-; GFX8-NEXT: flat_store_short v[2:3], v6
+; GFX8-NEXT: v_add_u16_e32 v7, v10, v7
+; GFX8-NEXT: flat_store_short v[4:5], v9
+; GFX8-NEXT: flat_store_short v[0:1], v6
+; GFX8-NEXT: flat_store_short v[2:3], v7
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -153,28 +153,28 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v0
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v12, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v13, v[6:7]
-; GFX8-NEXT: flat_load_ushort v14, v[8:9]
-; GFX8-NEXT: flat_load_ushort v15, v[10:11]
-; GFX8-NEXT: flat_load_ushort v16, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2
+; GFX8-NEXT: flat_load_ushort v12, v[6:7]
+; GFX8-NEXT: flat_load_ushort v13, v[8:9]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 8, v0
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v14, v[6:7]
+; GFX8-NEXT: flat_load_ushort v15, v[8:9]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v2
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 6, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v2
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 8, v2
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v2
; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_ushort v16, v[0:1]
; GFX8-NEXT: flat_load_ushort v17, v[2:3]
-; GFX8-NEXT: flat_load_ushort v18, v[0:1]
-; GFX8-NEXT: flat_load_ushort v19, v[6:7]
-; GFX8-NEXT: flat_load_ushort v20, v[8:9]
+; GFX8-NEXT: flat_load_ushort v18, v[6:7]
+; GFX8-NEXT: flat_load_ushort v19, v[8:9]
; GFX8-NEXT: flat_load_ushort v10, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_ushort v11, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4
@@ -184,20 +184,20 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 8, v4
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(4)
-; GFX8-NEXT: v_add_u16_e32 v11, v12, v17
+; GFX8-NEXT: v_add_u16_e32 v16, v16, v17
; GFX8-NEXT: s_waitcnt vmcnt(3)
-; GFX8-NEXT: v_add_u16_e32 v12, v13, v18
+; GFX8-NEXT: v_add_u16_e32 v12, v12, v18
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_add_u16_e32 v13, v14, v19
+; GFX8-NEXT: v_add_u16_e32 v13, v13, v19
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_add_u16_e32 v14, v15, v20
+; GFX8-NEXT: v_add_u16_e32 v10, v14, v10
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u16_e32 v10, v16, v10
-; GFX8-NEXT: flat_store_short v[4:5], v11
+; GFX8-NEXT: v_add_u16_e32 v11, v15, v11
+; GFX8-NEXT: flat_store_short v[4:5], v16
; GFX8-NEXT: flat_store_short v[0:1], v12
; GFX8-NEXT: flat_store_short v[2:3], v13
-; GFX8-NEXT: flat_store_short v[6:7], v14
-; GFX8-NEXT: flat_store_short v[8:9], v10
+; GFX8-NEXT: flat_store_short v[6:7], v10
+; GFX8-NEXT: flat_store_short v[8:9], v11
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -513,25 +513,25 @@ define void @add_v9i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v14, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: flat_load_ushort v1, v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_add_u16_e32 v1, v6, v10
-; GFX8-NEXT: v_add_u16_sdwa v2, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v3, v7, v11
-; GFX8-NEXT: v_add_u16_sdwa v10, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v11, v8, v12
+; GFX8-NEXT: v_add_u16_e32 v2, v6, v10
+; GFX8-NEXT: v_add_u16_sdwa v3, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v10, v7, v11
+; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v14, v8, v12
; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_u16_e32 v12, v9, v13
; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u16_e32 v13, v14, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v2
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v10
-; GFX8-NEXT: v_or_b32_e32 v2, v11, v8
+; GFX8-NEXT: v_add_u16_e32 v13, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v10, v11
+; GFX8-NEXT: v_or_b32_e32 v2, v14, v8
; GFX8-NEXT: v_or_b32_e32 v3, v12, v9
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -604,10 +604,10 @@ define void @add_v10i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v14, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_dword v15, v[0:1]
+; GFX8-NEXT: flat_load_dword v15, v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u16_e32 v0, v6, v10
; GFX8-NEXT: v_add_u16_sdwa v1, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -663,53 +663,53 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v2
-; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, 18, v2
-; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_ushort v14, v[14:15]
-; GFX8-NEXT: flat_load_ushort v15, v[16:17]
-; GFX8-NEXT: flat_load_ushort v16, v[2:3]
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(3)
-; GFX8-NEXT: v_add_u16_e32 v17, v6, v10
-; GFX8-NEXT: v_add_u16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 18, v0
-; GFX8-NEXT: v_add_u16_e32 v18, v7, v11
-; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v14, v6, v10
+; GFX8-NEXT: v_add_u16_sdwa v15, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v0
+; GFX8-NEXT: v_add_u16_e32 v16, v7, v11
+; GFX8-NEXT: v_add_u16_sdwa v17, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u16_e32 v18, v8, v12
+; GFX8-NEXT: v_add_u16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 18, v0
+; GFX8-NEXT: v_add_u16_e32 v19, v9, v13
+; GFX8-NEXT: v_add_u16_sdwa v13, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 16, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_ushort v20, v[6:7]
+; GFX8-NEXT: flat_load_ushort v21, v[8:9]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 18, v2
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v0
-; GFX8-NEXT: flat_load_ushort v2, v[2:3]
-; GFX8-NEXT: flat_load_ushort v3, v[6:7]
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v21, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v2
+; GFX8-NEXT: flat_load_ushort v10, v[10:11]
+; GFX8-NEXT: flat_load_ushort v11, v[6:7]
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_ushort v22, v[0:1]
+; GFX8-NEXT: flat_load_ushort v2, v[2:3]
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
-; GFX8-NEXT: v_add_u16_e32 v19, v8, v12
-; GFX8-NEXT: v_add_u16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 18, v4
-; GFX8-NEXT: v_add_u16_e32 v20, v9, v13
-; GFX8-NEXT: v_add_u16_sdwa v13, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
-; GFX8-NEXT: v_or_b32_e32 v0, v17, v10
-; GFX8-NEXT: v_or_b32_e32 v1, v18, v11
+; GFX8-NEXT: v_or_b32_e32 v0, v14, v15
+; GFX8-NEXT: v_or_b32_e32 v1, v16, v17
+; GFX8-NEXT: v_or_b32_e32 v3, v19, v13
+; GFX8-NEXT: s_waitcnt vmcnt(3)
+; GFX8-NEXT: v_add_u16_e32 v20, v20, v10
; GFX8-NEXT: v_add_u32_e32 v10, vcc, 20, v4
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_add_u16_e32 v14, v2, v14
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_add_u16_e32 v15, v3, v15
-; GFX8-NEXT: v_or_b32_e32 v2, v19, v12
-; GFX8-NEXT: v_or_b32_e32 v3, v20, v13
+; GFX8-NEXT: v_add_u16_e32 v21, v21, v11
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u16_e32 v16, v21, v16
+; GFX8-NEXT: v_add_u16_e32 v14, v22, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v18, v12
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: flat_store_short v[6:7], v14
-; GFX8-NEXT: flat_store_short v[8:9], v15
-; GFX8-NEXT: flat_store_short v[10:11], v16
+; GFX8-NEXT: flat_store_short v[6:7], v20
+; GFX8-NEXT: flat_store_short v[8:9], v21
+; GFX8-NEXT: flat_store_short v[10:11], v14
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -794,34 +794,34 @@ define void @add_v12i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[2:3]
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_add_u16_e32 v2, v6, v10
-; GFX8-NEXT: v_add_u16_sdwa v3, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v10, v7, v11
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v14, v6, v10
+; GFX8-NEXT: v_add_u16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v15, v7, v11
; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX8-NEXT: v_add_u16_e32 v16, v8, v12
-; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v12, v9, v13
-; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v3
-; GFX8-NEXT: v_or_b32_e32 v1, v10, v11
-; GFX8-NEXT: v_or_b32_e32 v2, v16, v8
-; GFX8-NEXT: v_or_b32_e32 v3, v12, v9
+; GFX8-NEXT: v_add_u16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v17, v9, v13
+; GFX8-NEXT: v_add_u16_sdwa v13, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[2:3]
+; GFX8-NEXT: v_or_b32_e32 v0, v14, v10
+; GFX8-NEXT: v_or_b32_e32 v1, v15, v11
+; GFX8-NEXT: v_or_b32_e32 v2, v16, v12
+; GFX8-NEXT: v_or_b32_e32 v3, v17, v13
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_add_u16_e32 v8, v6, v14
-; GFX8-NEXT: v_add_u16_sdwa v6, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v9, v7, v15
-; GFX8-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v10, v6, v8
+; GFX8-NEXT: v_add_u16_sdwa v6, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v8, v7, v9
+; GFX8-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v4
-; GFX8-NEXT: v_or_b32_e32 v6, v8, v6
-; GFX8-NEXT: v_or_b32_e32 v7, v9, v7
+; GFX8-NEXT: v_or_b32_e32 v6, v10, v6
+; GFX8-NEXT: v_or_b32_e32 v7, v8, v7
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; GFX8-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 86766e2904619..89f896a2b1656 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -288,16 +288,16 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s4
; GFX8V4-NEXT: v_mov_b32_e32 v1, s5
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
-; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s10
-; GFX8V4-NEXT: v_mov_b32_e32 v1, s11
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v3, s1
+; GFX8V4-NEXT: v_mov_b32_e32 v1, s11
; GFX8V4-NEXT: v_mov_b32_e32 v2, s0
; GFX8V4-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
@@ -314,16 +314,16 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
-; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s10
-; GFX8V5-NEXT: v_mov_b32_e32 v1, s11
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v3, s1
+; GFX8V5-NEXT: v_mov_b32_e32 v1, s11
; GFX8V5-NEXT: v_mov_b32_e32 v2, s0
; GFX8V5-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 5733cf9a44d32..d98dc6d7f6938 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -645,6 +645,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX1030-NEXT: s_clause 0x1
; GFX1030-NEXT: flat_load_dword v0, v[0:1]
; GFX1030-NEXT: flat_load_dword v1, v[2:3]
; GFX1030-NEXT: v_mov_b32_e32 v2, 0
@@ -674,6 +675,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0
+; GFX1013-NEXT: s_clause 0x1
; GFX1013-NEXT: flat_load_dword v0, v[4:5]
; GFX1013-NEXT: flat_load_dword v1, v[2:3]
; GFX1013-NEXT: v_mov_b32_e32 v2, 0
@@ -711,6 +713,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: flat_load_b32 v9, v[0:1]
; GFX11-NEXT: flat_load_b32 v10, v[2:3]
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8
@@ -757,6 +760,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX1030-NEXT: s_clause 0x1
; GFX1030-NEXT: flat_load_dword v0, v[0:1]
; GFX1030-NEXT: flat_load_dword v1, v[2:3]
; GFX1030-NEXT: v_mov_b32_e32 v2, 0
@@ -783,6 +787,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500
+; GFX1013-NEXT: s_clause 0x1
; GFX1013-NEXT: flat_load_dword v0, v[4:5]
; GFX1013-NEXT: flat_load_dword v1, v[2:3]
; GFX1013-NEXT: v_mov_b32_e32 v2, 0
@@ -816,6 +821,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: flat_load_b32 v6, v[0:1]
; GFX11-NEXT: flat_load_b32 v7, v[2:3]
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
index 603eb88c07afb..e6f5b7a295dfa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
@@ -77,10 +77,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[32:39]
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[40:41], v[44:47], off
; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[42:43], v[32:35], off
; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16
; W32-NEXT: s_endpgm
@@ -102,10 +101,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
; W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[44:51]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[40:41], v[44:47], off
; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[42:43], v[32:35], off
; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16
; W32-NEXT: s_endpgm
@@ -152,10 +150,9 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[32:39]
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[40:41], v[44:47], off
; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[42:43], v[32:35], off
; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16
; W32-NEXT: s_endpgm
@@ -177,10 +174,9 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
; W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[44:51]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[40:41], v[44:47], off
; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[42:43], v[32:35], off
; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16
; W32-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
index 7deaca4ca78b4..e79c398e74f68 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
@@ -69,6 +69,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[32:35]
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[36:37], v[40:43], off
; W64-NEXT: global_store_b128 v[38:39], v[32:35], off
; W64-NEXT: s_endpgm
@@ -90,6 +91,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
; W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[40:43]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[36:37], v[40:43], off
; W64-NEXT: global_store_b128 v[38:39], v[32:35], off
; W64-NEXT: s_endpgm
@@ -132,6 +134,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[32:35]
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[36:37], v[40:43], off
; W64-NEXT: global_store_b128 v[38:39], v[32:35], off
; W64-NEXT: s_endpgm
@@ -153,6 +156,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
; W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[40:43]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[36:37], v[40:43], off
; W64-NEXT: global_store_b128 v[38:39], v[32:35], off
; W64-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index c295a662704e9..7ad04f3a5de64 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -126,17 +126,17 @@ define amdgpu_kernel void @localize_globals(i1 %cond) {
; GFX9-NEXT: s_getpc_b64 s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, gv0@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, gv0@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, gv1@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, gv1@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB1_4: ; %bb2
; GFX9-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index c87c334217b77..5644d3a42b0c9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -61,6 +61,7 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-NEXT: global_load_dword v4, v3, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -82,6 +83,7 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v1, s[2:3]
; GFX11-NEXT: global_load_b32 v5, v2, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -113,6 +115,7 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v2, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -134,6 +137,7 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v5, v1, s[2:3]
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
index 7eafe53ea84cf..5dd768796dd7c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
@@ -13,10 +13,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
-; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
; GFX12-NEXT: s_endpgm
@@ -43,10 +42,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
-; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
; GFX12-NEXT: s_endpgm
@@ -71,6 +69,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
; GFX12-NEXT: s_endpgm
@@ -95,6 +94,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
; GFX12-NEXT: s_endpgm
@@ -121,10 +121,9 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
-; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: s_endpgm
@@ -151,10 +150,9 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32>
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
-; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off
; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16
; GFX12-NEXT: s_endpgm
@@ -181,10 +179,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
-; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: s_endpgm
@@ -211,10 +208,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
-; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: s_endpgm
@@ -241,10 +237,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
-; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: s_endpgm
@@ -271,10 +266,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
-; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
index 1e9ef07ba7542..af61f614519c0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
@@ -23,6 +23,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
@@ -67,6 +68,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
@@ -105,6 +107,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
@@ -143,6 +146,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
@@ -187,6 +191,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32>
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
@@ -221,6 +226,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: s_endpgm
@@ -247,6 +253,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32>
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off
; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off
; GFX12-NEXT: s_endpgm
@@ -283,6 +290,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
@@ -327,6 +335,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
@@ -371,6 +380,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
@@ -415,6 +425,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 50d20e9b0e4d7..443e6e6402be2 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -134,6 +134,7 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -148,6 +149,7 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 3160e38df5e3f..5fe362616e67d 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -513,16 +513,16 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-LABEL: introduced_copy_to_sgpr:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc
+; GFX908-NEXT: s_load_dword s0, s[8:9], 0x18
; GFX908-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
; GFX908-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10
-; GFX908-NEXT: s_load_dword s0, s[8:9], 0x18
-; GFX908-NEXT: s_mov_b32 s12, 0
-; GFX908-NEXT: s_mov_b32 s9, s12
+; GFX908-NEXT: s_mov_b32 s8, 0
+; GFX908-NEXT: s_mov_b32 s13, s8
+; GFX908-NEXT: v_mov_b32_e32 v19, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX908-NEXT: s_sub_i32 s1, 0, s7
; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s0
-; GFX908-NEXT: v_mov_b32_e32 v19, 0
; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, 0
; GFX908-NEXT: v_mov_b32_e32 v1, 0
@@ -542,14 +542,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_cselect_b32 s2, s6, s2
; GFX908-NEXT: s_add_i32 s3, s1, 1
; GFX908-NEXT: s_cmp_ge_u32 s2, s7
-; GFX908-NEXT: s_cselect_b32 s8, s3, s1
+; GFX908-NEXT: s_cselect_b32 s12, s3, s1
; GFX908-NEXT: s_lshr_b32 s2, s0, 16
; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s2
; GFX908-NEXT: s_lshl_b64 s[6:7], s[4:5], 5
; GFX908-NEXT: s_lshl_b64 s[14:15], s[10:11], 5
; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1]
; GFX908-NEXT: s_or_b32 s14, s14, 28
-; GFX908-NEXT: s_lshl_b64 s[16:17], s[8:9], 5
+; GFX908-NEXT: s_lshl_b64 s[16:17], s[12:13], 5
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readfirstlane_b32 s2, v16
; GFX908-NEXT: s_and_b32 s2, 0xffff, s2
@@ -573,15 +573,15 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
-; GFX908-NEXT: s_mov_b32 s13, s12
+; GFX908-NEXT: s_mov_b32 s9, s8
; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
-; GFX908-NEXT: v_mov_b32_e32 v4, s12
+; GFX908-NEXT: v_mov_b32_e32 v4, s8
; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6
-; GFX908-NEXT: v_mov_b32_e32 v6, s12
-; GFX908-NEXT: v_mov_b32_e32 v8, s12
-; GFX908-NEXT: v_mov_b32_e32 v5, s13
-; GFX908-NEXT: v_mov_b32_e32 v7, s13
-; GFX908-NEXT: v_mov_b32_e32 v9, s13
+; GFX908-NEXT: v_mov_b32_e32 v6, s8
+; GFX908-NEXT: v_mov_b32_e32 v8, s8
+; GFX908-NEXT: v_mov_b32_e32 v5, s9
+; GFX908-NEXT: v_mov_b32_e32 v7, s9
+; GFX908-NEXT: v_mov_b32_e32 v9, s9
; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
; GFX908-NEXT: v_mov_b32_e32 v11, v5
; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15]
@@ -667,7 +667,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
; GFX908-NEXT: ; %bb.11: ; %bb12
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT: s_add_u32 s10, s10, s8
+; GFX908-NEXT: s_add_u32 s10, s10, s12
; GFX908-NEXT: s_addc_u32 s11, s11, 0
; GFX908-NEXT: s_add_u32 s14, s14, s16
; GFX908-NEXT: s_addc_u32 s15, s15, s17
@@ -679,15 +679,15 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-LABEL: introduced_copy_to_sgpr:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc
+; GFX90A-NEXT: s_load_dword s0, s[8:9], 0x18
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
; GFX90A-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10
-; GFX90A-NEXT: s_load_dword s0, s[8:9], 0x18
-; GFX90A-NEXT: s_mov_b32 s12, 0
-; GFX90A-NEXT: s_mov_b32 s9, s12
+; GFX90A-NEXT: s_mov_b32 s8, 0
+; GFX90A-NEXT: s_mov_b32 s13, s8
+; GFX90A-NEXT: v_mov_b32_e32 v19, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX90A-NEXT: s_sub_i32 s1, 0, s7
-; GFX90A-NEXT: v_mov_b32_e32 v19, 0
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 0, 0
; GFX90A-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
@@ -707,14 +707,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_cselect_b32 s2, s6, s2
; GFX90A-NEXT: s_add_i32 s3, s1, 1
; GFX90A-NEXT: s_cmp_ge_u32 s2, s7
-; GFX90A-NEXT: s_cselect_b32 s8, s3, s1
+; GFX90A-NEXT: s_cselect_b32 s12, s3, s1
; GFX90A-NEXT: s_lshr_b32 s2, s0, 16
; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s2
; GFX90A-NEXT: s_lshl_b64 s[6:7], s[4:5], 5
; GFX90A-NEXT: s_lshl_b64 s[14:15], s[10:11], 5
; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1]
; GFX90A-NEXT: s_or_b32 s14, s14, 28
-; GFX90A-NEXT: s_lshl_b64 s[16:17], s[8:9], 5
+; GFX90A-NEXT: s_lshl_b64 s[16:17], s[12:13], 5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_readfirstlane_b32 s2, v18
; GFX90A-NEXT: s_and_b32 s2, 0xffff, s2
@@ -738,12 +738,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
-; GFX90A-NEXT: s_mov_b32 s13, s12
+; GFX90A-NEXT: s_mov_b32 s9, s8
; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3]
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v8
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15]
; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
@@ -821,7 +821,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
; GFX90A-NEXT: ; %bb.11: ; %bb12
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT: s_add_u32 s10, s10, s8
+; GFX90A-NEXT: s_add_u32 s10, s10, s12
; GFX90A-NEXT: s_addc_u32 s11, s11, 0
; GFX90A-NEXT: s_add_u32 s14, s14, s16
; GFX90A-NEXT: s_addc_u32 s15, s15, s17
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index fa73ef0b0ec4c..983c415442a66 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -6289,9 +6289,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x2
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
@@ -6355,7 +6355,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -6436,9 +6436,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
@@ -6805,11 +6804,11 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v69.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h
@@ -6876,8 +6875,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
; GFX11-FAKE16-NEXT: s_clause 0x2
; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -7086,9 +7085,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v32
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v32
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v31
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
@@ -7478,11 +7476,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v82
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v81
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v80
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v70
; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v69
@@ -16419,94 +16418,94 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:376
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:368
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:360
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:128
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:384
+; GFX11-TRUE16-NEXT: scratch_load_b32 v99, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:224
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:340
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v166, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:332
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
@@ -16552,87 +16551,88 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v49.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v53.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v70.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v70.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v52.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v99
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v83.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v84.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v84.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v160.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v65.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v54.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v161.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v53.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v52.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v49.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v48.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v39.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v31.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v31.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v160.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v166.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v166.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v167.l
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -16736,143 +16736,143 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v82.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19
; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v81.l
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v81.h
; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v55.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v96.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v83.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v83.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v85.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v70.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v69.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v71.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v50.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v65.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v66.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v36.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v24
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v54.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v53.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v49.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v34, v37
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v38, v50
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v51, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
@@ -16901,39 +16901,39 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
@@ -16971,36 +16971,36 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true
@@ -17144,15 +17144,15 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v82.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v82.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
@@ -17166,67 +17166,67 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v81.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v85.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v81.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.h, v17.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v64.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v87.l, v17.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v64.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v55.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.h, v19.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v54.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v96.l, v18.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v83.l, v17.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v83.h, v17.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
@@ -17234,13 +17234,13 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v55.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v84.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v84.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v38.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21
@@ -17249,15 +17249,15 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v69.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v23.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v69.h, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v23.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27
@@ -17265,68 +17265,68 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v51.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v50.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v71.l, v24.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v51.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v50.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v65.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v66.l, v23.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v65.h, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v66.h, v23.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v67.l, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v37.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v52.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v54.l, v29.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v52.h, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v51
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3
@@ -17336,14 +17336,14 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v39.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v48.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v48.h, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v49.l, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v49.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
@@ -17353,12 +17353,12 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v34, v37
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v38, v50
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v51, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -17366,58 +17366,64 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468
-; GFX11-FAKE16-NEXT: s_clause 0x12
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:616
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:612
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:608
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:604
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:600
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:596
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:592
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:588
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:584
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:580
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:576
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:572
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:568
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:564
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:560
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:556
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:552
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:548
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:544
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:540
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:536
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:532
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:528
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:524
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:520
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:516
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:512
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:508
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:504
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:500
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:496
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:492
+; GFX11-FAKE16-NEXT: s_clause 0x18
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:488
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:484
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:480
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:476
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:472
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:468
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:464
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:460
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:456
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:452
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:448
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:444
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:440
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:392
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
@@ -17427,94 +17433,94 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384
; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:376
; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:368
; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:316
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:312
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:308
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:304
+; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:300
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:296
+; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:128
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_load_b32 v112, off, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:72
; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:160
; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132
-; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:224
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:292
+; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_u16 v139, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_u16 v140, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_load_u16 v141, off, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_load_u16 v142, off, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_load_u16 v143, off, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_load_u16 v152, off, s32 offset:328
; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92
; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76
@@ -17543,85 +17549,89 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v31
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v87
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v96
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v4
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v97
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v98
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v99
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v112
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v113
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v114
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v115
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v116
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v130
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v131
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v132
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v133
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v134
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v146
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v20
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v147
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v18
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v148
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v16
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v149
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v149, 8, v14
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v150
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v133, 8, v12
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v178, 8, v178
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v134, 8, v10
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v177, 8, v177
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v8
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v176
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v167
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v137
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v136
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v138
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v139
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v140
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v141
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v142
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v143
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v152
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -17716,12 +17726,12 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v180
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v165
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v164
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v163
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v161
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63
@@ -17747,26 +17757,26 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162
; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v128
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v119
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v117
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v145
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v144
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v129
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v181
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v182
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v183
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v40
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v41
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v166
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v167
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v176
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v177
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v178
; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
@@ -17782,26 +17792,26 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v83
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v102
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v101
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v100
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v146
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v147
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v148
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v150
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v130
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v131
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v132
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v133
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v134
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
@@ -17817,26 +17827,26 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v69
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v81
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v70
; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65
; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v112
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v113
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v114
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v115
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v116
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v96
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v97
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v98
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v99
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
@@ -17880,39 +17890,39 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -17950,36 +17960,36 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow
; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4
@@ -18121,12 +18131,12 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v180, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v165, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v164, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v163, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v161, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v179, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
@@ -18172,16 +18182,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v162, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v135, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v128, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v119, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v117, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v145, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v144, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v129, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19
@@ -18192,16 +18202,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25
; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v181, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v182, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v183, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v40, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v41, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v166, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v167, v23
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v176, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v177, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v178, v26
; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17
; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18
; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19
@@ -18227,16 +18237,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v118, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v86, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v85, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v84, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v83, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v103, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v102, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v101, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v100, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v71, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24
@@ -18247,16 +18257,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29
; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30
; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v146, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v147, v23
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v148, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v149, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v150, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v130, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v131, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v132, v29
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v133, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v134, v31
; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22
; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23
; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24
@@ -18282,13 +18292,13 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v69, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v68, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v67, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v82, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v81, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v80, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v70, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3
@@ -18302,16 +18312,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35
; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v112, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v113, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v114, v29
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v115, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v116, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v87, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v96, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v97, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v98, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v99, v36
; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27
; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28
; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29
@@ -18340,58 +18350,64 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: .LBB14_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444
-; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0x12
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592
+; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:440
+; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:444
+; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:448
+; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:452
+; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:456
+; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:460
+; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:464
+; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:468
+; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:472
+; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:476
+; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:480
+; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:484
+; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:488
+; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:492
+; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:496
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:500
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:504
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:508
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:512
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:516
+; GFX11-FAKE16-NEXT: s_clause 0x18
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:520
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:524
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:528
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:532
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:536
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:540
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:544
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:548
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:552
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:556
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:560
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:564
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:568
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:572
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:576
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:580
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:584
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:588
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:592
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:596
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:600
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:604
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:608
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:612
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:616
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
@@ -21660,14 +21676,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:288
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276
@@ -22070,26 +22086,26 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v35
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
@@ -22509,30 +22525,30 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v38
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v33, v31
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v32, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v36
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -22601,47 +22617,43 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:460
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:456
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:452
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:448
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:444
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:440
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:336
+; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:320
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
@@ -22651,88 +22663,88 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:304
+; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:300
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:296
+; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:292
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:316
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:208
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260
-; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:260
; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:308
+; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:312
+; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:28
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5
@@ -22750,71 +22762,64 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29
; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v18
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v20
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v46, 8, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v47, 8, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v181, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v182, 8, v30
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v165, 8, v31
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v96
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v97
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v4
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v114
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v99
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v131
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v98
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v132
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v100
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v133
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v183, 8, v147
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v117
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v148
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 8, v129
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v149
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v130
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v115
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v116
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v95
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v104
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v105
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v100, 8, v106
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v107
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54
@@ -22881,153 +22886,153 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v42
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v58
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v59
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v180
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v179
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v57
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v62
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v46
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v47
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v181
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v182
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v178
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v183
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v128
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v165
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v161
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v112
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v103
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v102
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v176
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v147
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v148
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v150
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v134
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v151
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v67
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v85
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114
@@ -23036,8 +23041,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v101
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v83
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116
@@ -23045,30 +23050,30 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v96
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v65
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v97
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v98
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v69
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v99
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -23240,10 +23245,10 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v57
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v43
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v160
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2
@@ -23261,9 +23266,9 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v62, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v47, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v56, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -23279,14 +23284,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13
; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v42
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v180
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v179
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v164
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v163
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v162
; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
@@ -23297,16 +23301,16 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v128
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v103
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v102
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v60, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v61, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v44, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v45, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v46, v15
; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
@@ -23317,9 +23321,9 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v41, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v176, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v177, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -23334,14 +23338,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v144
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v178
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v135
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v119
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v161
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v112
; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
@@ -23352,16 +23356,16 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v134
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v118
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v67
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v181, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v183, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v40, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v165, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v166, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v167, v20
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
@@ -23372,9 +23376,9 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v151, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v132, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v133, v27
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -23389,14 +23393,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v87
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v86
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v84
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v71
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v70
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v68
; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
@@ -23407,14 +23411,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v148, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v149, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v150, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v129, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v130, v23
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v131, v25
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -23436,16 +23440,16 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v66
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v85
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v101
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v83
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v80
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v81
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v65
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v64
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v69
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -23459,11 +23463,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v96, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v97, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v98, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v99, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v100, v32
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
@@ -23492,47 +23496,43 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
; GFX11-FAKE16-NEXT: .LBB15_3: ; %end
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476
+; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:440
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:444
+; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:448
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:452
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:456
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:460
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB15_4:
@@ -44019,9 +44019,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x2
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
@@ -44085,7 +44085,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -44164,8 +44164,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 1.0, v20 :: v_dual_add_f32 v19, 1.0, v19
; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 1.0, v18 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v24, 1.0, v24 :: v_dual_add_f32 v31, 1.0, v31
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v32, 1.0, v32 :: v_dual_add_f32 v23, 1.0, v23
; GFX11-TRUE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v15, 1.0, v15
; GFX11-TRUE16-NEXT: v_dual_add_f32 v26, 1.0, v26 :: v_dual_add_f32 v29, 1.0, v29
@@ -44518,11 +44518,11 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v69.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h
@@ -44589,8 +44589,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
; GFX11-FAKE16-NEXT: s_clause 0x2
; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -44797,8 +44797,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-FAKE16-NEXT: v_dual_add_f32 v20, 1.0, v20 :: v_dual_add_f32 v19, 1.0, v19
; GFX11-FAKE16-NEXT: v_dual_add_f32 v18, 1.0, v18 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_dual_add_f32 v24, 1.0, v24 :: v_dual_add_f32 v31, 1.0, v31
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 1.0, v32 :: v_dual_add_f32 v23, 1.0, v23
; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v15, 1.0, v15
; GFX11-FAKE16-NEXT: v_dual_add_f32 v26, 1.0, v26 :: v_dual_add_f32 v29, 1.0, v29
@@ -45174,11 +45175,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v82
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v81
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v80
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v70
; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v69
@@ -55136,94 +55138,94 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:376
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:368
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:360
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:128
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:384
+; GFX11-TRUE16-NEXT: scratch_load_b32 v99, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:224
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:340
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v166, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:332
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
@@ -55269,87 +55271,88 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v49.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v53.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v70.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v70.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v52.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v99
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v83.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v84.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v84.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v160.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v65.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v54.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v161.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v53.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v52.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v49.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v48.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v39.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v31.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v31.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v160.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v166.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v166.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v167.l
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -55453,143 +55456,143 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v82.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19
; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v81.l
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v81.h
; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v55.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v96.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v83.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v83.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v85.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v70.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v69.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v71.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v50.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v65.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v66.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v36.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v24
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v54.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v53.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v49.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v34, v37
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v38, v50
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v51, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
@@ -55618,39 +55621,39 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
@@ -55688,36 +55691,36 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2
; GFX11-TRUE16-NEXT: .LBB38_4: ; %cmp.true
@@ -55861,15 +55864,15 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v82.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v82.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
@@ -55883,67 +55886,67 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v81.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v85.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v81.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.h, v17.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v64.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v87.l, v17.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v64.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v55.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.h, v19.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v54.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v96.l, v18.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v83.l, v17.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v83.h, v17.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
@@ -55951,13 +55954,13 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v55.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v84.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v84.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v38.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21
@@ -55966,15 +55969,15 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v69.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v23.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v69.h, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v23.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27
@@ -55982,68 +55985,68 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v51.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v50.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v71.l, v24.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v51.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v50.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v65.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v66.l, v23.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v65.h, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v66.h, v23.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v67.l, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v37.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v52.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v54.l, v29.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v52.h, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v51
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3
@@ -56053,14 +56056,14 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v39.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v48.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v48.h, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v49.l, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v49.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
@@ -56070,12 +56073,12 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v34, v37
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v38, v50
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v51, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -56083,58 +56086,64 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468
-; GFX11-FAKE16-NEXT: s_clause 0x12
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:616
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:612
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:608
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:604
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:600
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:596
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:592
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:588
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:584
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:580
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:576
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:572
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:568
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:564
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:560
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:556
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:552
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:548
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:544
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:540
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:536
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:532
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:528
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:524
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:520
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:516
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:512
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:508
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:504
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:500
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:496
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:492
+; GFX11-FAKE16-NEXT: s_clause 0x18
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:488
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:484
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:480
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:476
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:472
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:468
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:464
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:460
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:456
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:452
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:448
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:444
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:440
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:392
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
@@ -56144,94 +56153,94 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384
; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:376
; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:368
; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:316
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:312
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:308
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:304
+; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:300
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:296
+; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:128
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_load_b32 v112, off, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:72
; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:160
; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132
-; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:224
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:292
+; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_u16 v139, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_u16 v140, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_load_u16 v141, off, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_load_u16 v142, off, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_load_u16 v143, off, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_load_u16 v152, off, s32 offset:328
; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92
; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76
@@ -56260,85 +56269,89 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v31
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v87
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v96
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v4
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v97
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v98
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v99
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v112
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v113
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v114
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v115
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v116
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v130
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v131
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v132
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v133
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v134
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v146
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v20
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v147
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v18
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v148
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v16
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v149
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v149, 8, v14
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v150
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v133, 8, v12
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v178, 8, v178
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v134, 8, v10
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v177, 8, v177
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v8
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v176
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v167
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v137
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v136
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v138
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v139
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v140
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v141
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v142
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v143
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v152
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -56433,12 +56446,12 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v180
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v165
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v164
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v163
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v161
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63
@@ -56464,26 +56477,26 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162
; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v128
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v119
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v117
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v145
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v144
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v129
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v181
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v182
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v183
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v40
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v41
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v166
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v167
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v176
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v177
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v178
; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
@@ -56499,26 +56512,26 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v83
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v102
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v101
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v100
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v146
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v147
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v148
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v150
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v130
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v131
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v132
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v133
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v134
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
@@ -56534,26 +56547,26 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v69
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v81
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v70
; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65
; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v112
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v113
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v114
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v115
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v116
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v96
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v97
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v98
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v99
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
@@ -56597,39 +56610,39 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -56667,36 +56680,36 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
; GFX11-FAKE16-NEXT: .LBB38_2: ; %Flow
; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB38_4
@@ -56838,12 +56851,12 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v180, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v165, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v164, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v163, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v161, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v179, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
@@ -56889,16 +56902,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v162, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v135, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v128, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v119, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v117, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v145, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v144, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v129, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19
@@ -56909,16 +56922,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25
; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v181, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v182, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v183, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v40, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v41, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v166, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v167, v23
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v176, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v177, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v178, v26
; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17
; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18
; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19
@@ -56944,16 +56957,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v118, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v86, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v85, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v84, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v83, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v103, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v102, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v101, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v100, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v71, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24
@@ -56964,16 +56977,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29
; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30
; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v146, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v147, v23
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v148, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v149, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v150, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v130, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v131, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v132, v29
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v133, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v134, v31
; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22
; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23
; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24
@@ -56999,13 +57012,13 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v69, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v68, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v67, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v82, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v81, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v80, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v70, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3
@@ -57019,16 +57032,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35
; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v112, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v113, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v114, v29
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v115, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v116, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v87, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v96, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v97, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v98, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v99, v36
; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27
; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28
; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29
@@ -57057,58 +57070,64 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: .LBB38_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444
-; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0x12
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592
+; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:440
+; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:444
+; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:448
+; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:452
+; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:456
+; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:460
+; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:464
+; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:468
+; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:472
+; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:476
+; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:480
+; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:484
+; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:488
+; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:492
+; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:496
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:500
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:504
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:508
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:512
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:516
+; GFX11-FAKE16-NEXT: s_clause 0x18
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:520
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:524
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:528
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:532
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:536
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:540
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:544
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:548
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:552
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:556
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:560
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:564
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:568
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:572
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:576
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:580
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:584
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:588
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:592
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:596
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:600
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:604
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:608
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:612
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:616
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
@@ -60377,14 +60396,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:288
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276
@@ -60787,26 +60806,26 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v35
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
@@ -61226,30 +61245,30 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v38
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v33, v31
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v32, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v36
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -61318,47 +61337,43 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:460
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:456
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:452
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:448
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:444
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:440
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:336
+; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:320
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
@@ -61368,88 +61383,88 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:304
+; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:300
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:296
+; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:292
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:316
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:208
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260
-; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:260
; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:308
+; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:312
+; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:28
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5
@@ -61467,71 +61482,64 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29
; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v18
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v20
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v46, 8, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v47, 8, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v181, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v182, 8, v30
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v165, 8, v31
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v96
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v97
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v4
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v114
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v99
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v131
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v98
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v132
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v100
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v133
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v183, 8, v147
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v117
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v148
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 8, v129
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v149
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v130
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v115
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v116
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v95
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v104
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v105
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v100, 8, v106
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v107
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB39_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54
@@ -61598,153 +61606,153 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v42
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v58
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v59
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v180
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v179
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v57
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v62
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v46
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v47
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v181
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v182
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v178
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v183
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v128
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v165
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v161
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v112
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v103
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v102
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v176
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v147
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v148
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v150
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v134
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v151
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v67
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v85
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114
@@ -61753,8 +61761,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v101
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v83
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116
@@ -61762,30 +61770,30 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v96
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v65
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v97
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v98
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v69
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v99
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -61957,10 +61965,10 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v57
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v43
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v160
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2
@@ -61978,9 +61986,9 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v62, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v47, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v56, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -61996,14 +62004,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13
; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v42
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v180
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v179
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v164
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v163
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v162
; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
@@ -62014,16 +62021,16 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v128
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v103
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v102
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v60, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v61, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v44, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v45, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v46, v15
; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
@@ -62034,9 +62041,9 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v41, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v176, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v177, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -62051,14 +62058,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v144
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v178
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v135
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v119
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v161
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v112
; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
@@ -62069,16 +62076,16 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v134
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v118
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v67
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v181, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v183, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v40, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v165, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v166, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v167, v20
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
@@ -62089,9 +62096,9 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v151, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v132, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v133, v27
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -62106,14 +62113,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v87
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v86
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v84
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v71
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v70
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v68
; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
@@ -62124,14 +62131,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v148, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v149, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v150, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v129, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v130, v23
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v131, v25
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -62153,16 +62160,16 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v66
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v85
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v101
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v83
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v80
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v81
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v65
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v64
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v69
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -62176,11 +62183,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v96, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v97, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v98, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v99, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v100, v32
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
@@ -62209,47 +62216,43 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
; GFX11-FAKE16-NEXT: .LBB39_3: ; %end
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476
+; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:440
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:444
+; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:448
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:452
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:456
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:460
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB39_4:
@@ -81719,9 +81722,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x2
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
@@ -81785,7 +81788,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -81897,14 +81900,13 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v28, vcc_lo
; GFX11-TRUE16-NEXT: v_add_co_u32 v29, vcc_lo, v29, 3
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v30, null, 0, v30, vcc_lo
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v31, vcc_lo, v31, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v32, null, 0, v32, vcc_lo
; GFX11-TRUE16-NEXT: v_add_co_u32 v23, vcc_lo, v23, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v24, vcc_lo
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
@@ -82243,11 +82245,11 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v69.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h
@@ -82314,8 +82316,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
; GFX11-FAKE16-NEXT: s_clause 0x2
; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -82555,14 +82557,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v28, vcc_lo
; GFX11-FAKE16-NEXT: v_add_co_u32 v29, vcc_lo, v29, 3
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v30, null, 0, v30, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_add_co_u32 v31, vcc_lo, v31, 3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v32, null, 0, v32, vcc_lo
; GFX11-FAKE16-NEXT: v_add_co_u32 v23, vcc_lo, v23, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v24, vcc_lo
; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20]
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18]
; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
@@ -82924,11 +82926,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v82
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v81
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v80
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v70
; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v69
@@ -91872,94 +91875,94 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:376
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:368
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:360
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:128
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:384
+; GFX11-TRUE16-NEXT: scratch_load_b32 v99, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:224
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:340
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v166, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:332
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
@@ -92005,87 +92008,88 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v49.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v53.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v70.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v70.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v52.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v99
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v83.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v84.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v84.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v160.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v65.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v54.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v161.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v53.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v52.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v49.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v48.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v39.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v31.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v31.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v160.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v166.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v166.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v167.l
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -92189,143 +92193,143 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v82.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19
; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v81.l
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v81.h
; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v55.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v96.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v83.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v83.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v85.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v70.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v69.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v71.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v50.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v65.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v66.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v36.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v24
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v54.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v53.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v49.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v34, v37
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v38, v50
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v51, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
@@ -92354,39 +92358,39 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
@@ -92424,36 +92428,36 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2
; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true
@@ -92597,15 +92601,15 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v82.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v82.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
@@ -92619,67 +92623,67 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v81.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v85.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v81.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.h, v17.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v64.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v87.l, v17.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v64.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v55.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.h, v19.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v54.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v96.l, v18.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v83.l, v17.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v83.h, v17.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
@@ -92687,13 +92691,13 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v55.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v84.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v84.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v38.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21
@@ -92702,15 +92706,15 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v69.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v23.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v69.h, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v23.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27
@@ -92718,68 +92722,68 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v51.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v50.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v71.l, v24.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v51.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v50.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v65.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v66.l, v23.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v65.h, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v66.h, v23.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v67.l, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v37.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v52.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v54.l, v29.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v52.h, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v51
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3
@@ -92789,14 +92793,14 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v39.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v48.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v48.h, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v49.l, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v49.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
@@ -92806,12 +92810,12 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v34, v37
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v38, v50
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v51, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -92819,58 +92823,64 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468
-; GFX11-FAKE16-NEXT: s_clause 0x12
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:616
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:612
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:608
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:604
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:600
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:596
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:592
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:588
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:584
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:580
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:576
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:572
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:568
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:564
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:560
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:556
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:552
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:548
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:544
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:540
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:536
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:532
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:528
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:524
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:520
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:516
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:512
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:508
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:504
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:500
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:496
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:492
+; GFX11-FAKE16-NEXT: s_clause 0x18
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:488
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:484
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:480
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:476
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:472
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:468
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:464
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:460
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:456
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:452
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:448
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:444
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:440
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:392
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
@@ -92880,94 +92890,94 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384
; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:376
; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:368
; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:316
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:312
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:308
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:304
+; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:300
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:296
+; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:128
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_load_b32 v112, off, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:72
; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:160
; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132
-; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:224
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:292
+; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_u16 v139, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_u16 v140, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_load_u16 v141, off, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_load_u16 v142, off, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_load_u16 v143, off, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_load_u16 v152, off, s32 offset:328
; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92
; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76
@@ -92996,85 +93006,89 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v31
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v87
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v96
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v4
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v97
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v98
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v99
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v112
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v113
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v114
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v115
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v116
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v130
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v131
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v132
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v133
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v134
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v146
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v20
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v147
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v18
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v148
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v16
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v149
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v149, 8, v14
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v150
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v133, 8, v12
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v178, 8, v178
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v134, 8, v10
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v177, 8, v177
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v8
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v176
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v167
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v137
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v136
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v138
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v139
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v140
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v141
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v142
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v143
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v152
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -93169,12 +93183,12 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v180
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v165
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v164
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v163
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v161
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63
@@ -93200,26 +93214,26 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162
; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v128
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v119
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v117
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v145
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v144
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v129
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v181
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v182
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v183
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v40
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v41
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v166
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v167
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v176
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v177
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v178
; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
@@ -93235,26 +93249,26 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v83
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v102
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v101
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v100
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v146
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v147
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v148
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v150
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v130
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v131
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v132
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v133
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v134
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
@@ -93270,26 +93284,26 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v69
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v81
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v70
; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65
; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v112
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v113
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v114
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v115
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v116
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v96
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v97
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v98
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v99
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
@@ -93333,39 +93347,39 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -93403,36 +93417,36 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
; GFX11-FAKE16-NEXT: .LBB58_2: ; %Flow
; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_4
@@ -93574,12 +93588,12 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v180, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v165, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v164, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v163, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v161, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v179, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
@@ -93625,16 +93639,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v162, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v135, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v128, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v119, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v117, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v145, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v144, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v129, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19
@@ -93645,16 +93659,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25
; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v181, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v182, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v183, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v40, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v41, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v166, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v167, v23
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v176, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v177, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v178, v26
; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17
; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18
; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19
@@ -93680,16 +93694,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v118, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v86, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v85, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v84, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v83, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v103, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v102, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v101, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v100, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v71, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24
@@ -93700,16 +93714,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29
; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30
; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v146, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v147, v23
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v148, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v149, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v150, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v130, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v131, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v132, v29
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v133, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v134, v31
; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22
; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23
; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24
@@ -93735,13 +93749,13 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v69, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v68, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v67, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v82, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v81, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v80, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v70, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3
@@ -93755,16 +93769,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35
; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v112, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v113, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v114, v29
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v115, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v116, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v87, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v96, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v97, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v98, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v99, v36
; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27
; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28
; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29
@@ -93793,58 +93807,64 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: .LBB58_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444
-; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0x12
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592
+; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:440
+; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:444
+; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:448
+; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:452
+; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:456
+; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:460
+; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:464
+; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:468
+; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:472
+; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:476
+; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:480
+; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:484
+; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:488
+; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:492
+; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:496
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:500
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:504
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:508
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:512
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:516
+; GFX11-FAKE16-NEXT: s_clause 0x18
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:520
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:524
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:528
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:532
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:536
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:540
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:544
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:548
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:552
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:556
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:560
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:564
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:568
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:572
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:576
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:580
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:584
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:588
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:592
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:596
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:600
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:604
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:608
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:612
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:616
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
@@ -97113,14 +97133,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:288
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276
@@ -97523,26 +97543,26 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v35
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
@@ -97962,30 +97982,30 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v38
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v33, v31
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v32, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v36
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -98054,47 +98074,43 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:460
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:456
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:452
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:448
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:444
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:440
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:336
+; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:320
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
@@ -98104,88 +98120,88 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:304
+; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:300
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:296
+; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:292
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:316
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:208
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260
-; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:260
; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:308
+; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:312
+; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:28
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5
@@ -98203,71 +98219,64 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29
; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v18
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v20
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v46, 8, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v47, 8, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v181, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v182, 8, v30
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v165, 8, v31
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v96
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v97
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v4
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v114
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v99
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v131
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v98
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v132
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v100
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v133
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v183, 8, v147
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v117
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v148
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 8, v129
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v149
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v130
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v115
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v116
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v95
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v104
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v105
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v100, 8, v106
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v107
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54
@@ -98334,153 +98343,153 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v42
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v58
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v59
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v180
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v179
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v57
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v62
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v46
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v47
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v181
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v182
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v178
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v183
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v128
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v165
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v161
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v112
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v103
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v102
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v176
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v147
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v148
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v150
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v134
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v151
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v67
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v85
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114
@@ -98489,8 +98498,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v101
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v83
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116
@@ -98498,30 +98507,30 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v96
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v65
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v97
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v98
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v69
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v99
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -98693,10 +98702,10 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v57
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v43
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v160
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2
@@ -98714,9 +98723,9 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v62, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v47, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v56, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -98732,14 +98741,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13
; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v42
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v180
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v179
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v164
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v163
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v162
; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
@@ -98750,16 +98758,16 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v128
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v103
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v102
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v60, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v61, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v44, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v45, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v46, v15
; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
@@ -98770,9 +98778,9 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v41, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v176, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v177, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -98787,14 +98795,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v144
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v178
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v135
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v119
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v161
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v112
; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
@@ -98805,16 +98813,16 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v134
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v118
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v67
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v181, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v183, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v40, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v165, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v166, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v167, v20
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
@@ -98825,9 +98833,9 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v151, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v132, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v133, v27
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -98842,14 +98850,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v87
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v86
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v84
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v71
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v70
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v68
; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
@@ -98860,14 +98868,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v148, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v149, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v150, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v129, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v130, v23
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v131, v25
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -98889,16 +98897,16 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v66
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v85
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v101
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v83
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v80
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v81
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v65
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v64
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v69
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -98912,11 +98920,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v96, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v97, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v98, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v99, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v100, v32
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
@@ -98945,47 +98953,43 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
; GFX11-FAKE16-NEXT: .LBB59_3: ; %end
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476
+; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:440
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:444
+; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:448
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:452
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:456
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:460
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB59_4:
@@ -117466,9 +117470,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x2
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
@@ -117532,7 +117536,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -117965,11 +117969,11 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v69.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h
@@ -118036,8 +118040,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
; GFX11-FAKE16-NEXT: s_clause 0x2
; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -118621,11 +118625,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v82
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v81
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v80
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v70
; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v69
@@ -128551,94 +128556,94 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:376
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:368
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:360
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:128
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:384
+; GFX11-TRUE16-NEXT: scratch_load_b32 v99, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:224
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:340
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v166, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:332
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
@@ -128684,87 +128689,88 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v49.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v53.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v70.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v70.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v52.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v99
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v83.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v84.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v84.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v160.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v65.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v54.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v161.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v53.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v52.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v49.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v48.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v39.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v31.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v31.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v160.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v166.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v166.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v167.l
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -128868,143 +128874,143 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v82.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19
; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v81.l
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v81.h
; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v55.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v96.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v83.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v83.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v85.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v70.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v69.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v71.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v50.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v65.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v66.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v36.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v24
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v54.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v53.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v49.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v34, v37
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v38, v50
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v51, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
@@ -129033,39 +129039,39 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
@@ -129103,36 +129109,36 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2
; GFX11-TRUE16-NEXT: .LBB74_4: ; %cmp.true
@@ -129276,15 +129282,15 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v82.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v82.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
@@ -129298,67 +129304,67 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v81.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v85.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v81.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.h, v17.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v64.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v87.l, v17.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v64.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v55.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.h, v19.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v54.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v96.l, v18.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v83.l, v17.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v83.h, v17.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
@@ -129366,13 +129372,13 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v55.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v84.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v84.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v38.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21
@@ -129381,15 +129387,15 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v69.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v23.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v69.h, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v23.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27
@@ -129397,68 +129403,68 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v51.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v50.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v71.l, v24.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v51.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v50.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v65.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v66.l, v23.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v65.h, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v66.h, v23.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v67.l, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v37.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v52.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v54.l, v29.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v52.h, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v51
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3
@@ -129468,14 +129474,14 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v39.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v48.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v48.h, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v49.l, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v49.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
@@ -129485,12 +129491,12 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v34, v37
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v38, v50
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v51, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -129498,58 +129504,64 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468
-; GFX11-FAKE16-NEXT: s_clause 0x12
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:616
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:612
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:608
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:604
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:600
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:596
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:592
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:588
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:584
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:580
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:576
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:572
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:568
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:564
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:560
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:556
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:552
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:548
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:544
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:540
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:536
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:532
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:528
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:524
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:520
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:516
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:512
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:508
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:504
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:500
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:496
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:492
+; GFX11-FAKE16-NEXT: s_clause 0x18
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:488
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:484
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:480
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:476
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:472
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:468
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:464
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:460
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:456
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:452
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:448
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:444
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:440
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:392
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
@@ -129559,94 +129571,94 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384
; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:376
; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:368
; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:316
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:312
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:308
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:304
+; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:300
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:296
+; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:128
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_load_b32 v112, off, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:72
; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:160
; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132
-; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:224
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:292
+; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_u16 v139, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_u16 v140, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_load_u16 v141, off, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_load_u16 v142, off, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_load_u16 v143, off, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_load_u16 v152, off, s32 offset:328
; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92
; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76
@@ -129675,85 +129687,89 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v31
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v87
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v96
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v4
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v97
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v98
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v99
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v112
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v113
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v114
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v115
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v116
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v130
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v131
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v132
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v133
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v134
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v146
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v20
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v147
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v18
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v148
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v16
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v149
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v149, 8, v14
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v150
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v133, 8, v12
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v178, 8, v178
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v134, 8, v10
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v177, 8, v177
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v8
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v176
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v167
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v137
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v136
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v138
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v139
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v140
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v141
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v142
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v143
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v152
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -129848,12 +129864,12 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v180
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v165
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v164
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v163
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v161
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63
@@ -129879,26 +129895,26 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162
; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v128
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v119
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v117
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v145
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v144
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v129
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v181
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v182
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v183
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v40
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v41
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v166
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v167
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v176
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v177
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v178
; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
@@ -129914,26 +129930,26 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v83
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v102
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v101
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v100
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v146
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v147
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v148
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v150
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v130
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v131
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v132
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v133
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v134
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
@@ -129949,26 +129965,26 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v69
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v81
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v70
; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65
; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v112
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v113
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v114
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v115
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v116
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v96
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v97
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v98
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v99
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
@@ -130012,39 +130028,39 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -130082,36 +130098,36 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
; GFX11-FAKE16-NEXT: .LBB74_2: ; %Flow
; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB74_4
@@ -130253,12 +130269,12 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v180, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v165, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v164, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v163, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v161, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v179, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
@@ -130304,16 +130320,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v162, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v135, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v128, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v119, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v117, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v145, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v144, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v129, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19
@@ -130324,16 +130340,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25
; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v181, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v182, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v183, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v40, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v41, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v166, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v167, v23
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v176, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v177, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v178, v26
; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17
; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18
; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19
@@ -130359,16 +130375,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v118, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v86, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v85, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v84, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v83, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v103, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v102, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v101, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v100, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v71, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24
@@ -130379,16 +130395,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29
; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30
; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v146, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v147, v23
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v148, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v149, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v150, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v130, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v131, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v132, v29
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v133, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v134, v31
; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22
; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23
; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24
@@ -130414,13 +130430,13 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v69, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v68, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v67, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v82, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v81, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v80, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v70, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3
@@ -130434,16 +130450,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35
; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v112, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v113, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v114, v29
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v115, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v116, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v87, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v96, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v97, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v98, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v99, v36
; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27
; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28
; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29
@@ -130472,58 +130488,64 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: .LBB74_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444
-; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0x12
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592
+; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:440
+; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:444
+; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:448
+; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:452
+; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:456
+; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:460
+; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:464
+; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:468
+; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:472
+; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:476
+; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:480
+; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:484
+; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:488
+; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:492
+; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:496
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:500
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:504
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:508
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:512
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:516
+; GFX11-FAKE16-NEXT: s_clause 0x18
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:520
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:524
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:528
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:532
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:536
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:540
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:544
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:548
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:552
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:556
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:560
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:564
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:568
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:572
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:576
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:580
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:584
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:588
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:592
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:596
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:600
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:604
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:608
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:612
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:616
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
@@ -133792,14 +133814,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:288
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276
@@ -134202,26 +134224,26 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v35
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
@@ -134641,30 +134663,30 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v38
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v33, v31
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v32, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v36
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -134733,47 +134755,43 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:460
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:456
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:452
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:448
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:444
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:440
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:336
+; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:320
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
@@ -134783,88 +134801,88 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:304
+; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:300
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:296
+; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:292
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:316
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:208
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260
-; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:260
; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:308
+; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:312
+; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:28
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5
@@ -134882,71 +134900,64 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29
; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v18
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v20
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v46, 8, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v47, 8, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v181, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v182, 8, v30
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v165, 8, v31
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v96
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v97
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v4
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v114
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v99
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v131
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v98
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v132
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v100
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v133
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v183, 8, v147
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v117
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v148
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 8, v129
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v149
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v130
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v115
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v116
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v95
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v104
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v105
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v100, 8, v106
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v107
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB75_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54
@@ -135013,153 +135024,153 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v42
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v58
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v59
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v180
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v179
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v57
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v62
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v46
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v47
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v181
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v182
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v178
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v183
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v128
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v165
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v161
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v112
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v103
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v102
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v176
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v147
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v148
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v150
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v134
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v151
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v67
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v85
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114
@@ -135168,8 +135179,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v101
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v83
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116
@@ -135177,30 +135188,30 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v96
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v65
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v97
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v98
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v69
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v99
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -135372,10 +135383,10 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v57
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v43
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v160
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2
@@ -135393,9 +135404,9 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v62, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v47, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v56, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -135411,14 +135422,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13
; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v42
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v180
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v179
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v164
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v163
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v162
; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
@@ -135429,16 +135439,16 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v128
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v103
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v102
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v60, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v61, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v44, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v45, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v46, v15
; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
@@ -135449,9 +135459,9 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v41, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v176, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v177, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -135466,14 +135476,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v144
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v178
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v135
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v119
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v161
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v112
; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
@@ -135484,16 +135494,16 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v134
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v118
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v67
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v181, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v183, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v40, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v165, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v166, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v167, v20
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
@@ -135504,9 +135514,9 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v151, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v132, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v133, v27
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -135521,14 +135531,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v87
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v86
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v84
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v71
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v70
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v68
; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
@@ -135539,14 +135549,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v148, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v149, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v150, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v129, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v130, v23
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v131, v25
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -135568,16 +135578,16 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v66
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v85
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v101
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v83
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v80
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v81
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v65
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v64
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v69
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -135591,11 +135601,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v96, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v97, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v98, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v99, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v100, v32
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
@@ -135624,47 +135634,43 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
; GFX11-FAKE16-NEXT: .LBB75_3: ; %end
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476
+; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:440
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:444
+; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:448
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:452
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:456
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:460
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB75_4:
@@ -155300,223 +155306,223 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:380
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:364
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v146, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:260
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:252
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v135, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:236
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v117, off, s32 offset:128
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v119, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v118, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:384
; GFX11-TRUE16-NEXT: scratch_load_b32 v160, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v117, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v118, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v119, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v128, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v132, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:156
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v128, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v132, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:340
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:332
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:76
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v12.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v29.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v148.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v147.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v146.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v134.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v129.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v131.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v130.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v119.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v112.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v151.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v100.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v103.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v103.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v114.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v114.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v115.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v115.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v117.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v128.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v117.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v132.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v128.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v133.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v134.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v147.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v150.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v31.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -155529,101 +155535,101 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB88_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v64.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v53.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v70.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v97.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v50.h
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v67.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v66.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v80.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v48.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v81.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v81.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v82.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v83.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v97.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v70.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v98.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v99.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v99.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v101.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v102.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v102.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v103.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v112.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v112.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v113.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v113.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v115.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v86.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v97.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v99.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v103.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v114.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v115.h
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v112.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v118.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v118.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v114.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v113.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v119.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v128.l
; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v128.h
@@ -155658,100 +155664,100 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v150.h
; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v151.l
; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v151.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
@@ -155789,11 +155795,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB88_2
; GFX11-TRUE16-NEXT: .LBB88_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v116.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
@@ -155808,16 +155814,16 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v116.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v112.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v113.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v100.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v147.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v148.h, v1.l
@@ -155827,10 +155833,10 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v102.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v101.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
@@ -155841,15 +155847,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v146.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v145.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v96.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v98.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
@@ -155865,13 +155871,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v87.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v85.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v85.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
@@ -155886,19 +155890,16 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v71.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v130.h, v1.l
@@ -155908,12 +155909,10 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v66.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
@@ -155924,16 +155923,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v128.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v129.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v119.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v86.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v35.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -155942,7 +155940,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v128.l, v2.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v119.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v114.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v113.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l
@@ -155952,103 +155950,103 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.h, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v70.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v71.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v117.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v114.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v116.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v116.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v115.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v112.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v115.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v115.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v114.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v69.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.h, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v68.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v115.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v113.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v103.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v112.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v113.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v114.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v103.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v100.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v102.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v103.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v67.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v54.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v64.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v112.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v101.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v102.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v103.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v101.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v101.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v97.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v99.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v99.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v97.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v55.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v55.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v51.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v102.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v99.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v87.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v98.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v99.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v98.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v96.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v84.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v86.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v87.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v54.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v54.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v51.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v51.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v52.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v51.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v97.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v70.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v86.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v70.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v82.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v83.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v81.l, v1.h
@@ -156124,207 +156122,204 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:316
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:312
+; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:308
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:304
+; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:300
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:296
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:128
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v127, off, s32 offset:384
; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:56
; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:64
; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:224
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:132
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:292
+; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_load_u16 v120, off, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_load_u16 v123, off, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_load_u16 v121, off, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_load_u16 v122, off, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:92
; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:20
; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v86, 8, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v134, 8, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v19
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v21
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v151, 8, v23
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v27
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v29
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v20
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v31
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v40
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v47
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v56
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v127
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v166
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v178, 8, v176
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v179
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v182
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v183
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v42
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v43
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v46
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v42, 8, v58
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v59
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v57
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v30
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v75
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v72
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v74
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v22
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v63
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v72
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v91
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v88
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v89
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v18
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v104
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v94
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v90
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v107
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v106
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v94
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v6
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v105
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v120
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v123
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v121
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v122
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -156335,10 +156330,10 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v39
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v86
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v98
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v96
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v36
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v51
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
@@ -156350,16 +156345,16 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v50
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54
; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v132
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v100
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v161
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v167
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v119
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v81
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v131
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v115
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v134
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v102
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v160
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v151
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v162
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v161
; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v4, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v4, v7, v6, 0x5040100
@@ -156368,70 +156363,70 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v102
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v114
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v96
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v133
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v135
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v130
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v181
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v103
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v100
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v116
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v101
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v135
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v129
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v145
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v132
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v165
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v150
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v41
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v40
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v43
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v182
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v46
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v57
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v56
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v178
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v179
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v166
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v183
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v182
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v43
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v42
; GFX11-FAKE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v8, v10, v9, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v9, v12, v11, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v10, v14, v13, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v11, v16, v15, 0x5040100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v147
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v119
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v148
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v130
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v149
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v144
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v162
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v146
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v164
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v148
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v58
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v44
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v60
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v62
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v47
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v72
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v63
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v74
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v73
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v147
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v69
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v117
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v46
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v181
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v56
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v47
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v58
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v40
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v60
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v59
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v72
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v63
; GFX11-FAKE16-NEXT: v_perm_b32 v12, v13, v12, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v13, v15, v14, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v14, v17, v16, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v15, v19, v18, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v16, v21, v20, 0x5040100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v165
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v180
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v42
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v65
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v75
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v61
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v144
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v112
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v146
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v71
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v163
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v167
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v164
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v74
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v57
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v78
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v77
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v79
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v76
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v75
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v90
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v89
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v92
@@ -156441,16 +156436,16 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v19, v22, v21, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v23, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v25, 0x5040100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v64
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v68
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v67
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v101
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v177
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v83
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v97
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v180
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v44
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v41
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v45
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v99
; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v93
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v88
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v104
@@ -156466,16 +156461,16 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v26, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v25, v29, v28, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v26, v31, v30, 0x5040100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v99
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v129
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v98
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v131
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v116
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v134
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v113
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v114
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v62
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v73
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v61
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v76
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v133
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v128
; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v111
; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v106
; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v122
@@ -156507,94 +156502,94 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92
@@ -156623,11 +156618,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB88_4
; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v133, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v128, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v76, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v118, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v73, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -156639,15 +156634,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v124, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v33, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v98, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v61, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v128, 0x300, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v114, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v62, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v113, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
@@ -156656,38 +156651,38 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v122, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v121, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v113, 0x300, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v111, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v101, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v85, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v45, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v99, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v44, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v41, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v106, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v110, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v109, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v108, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v114, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v97, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v180, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v84, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v83, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v177, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
@@ -156696,142 +156691,134 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v70, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v70, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v167, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v164, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v82, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v163, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v82, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v84, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v71, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v65, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v66, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v144, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0
; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v66, 0x300, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v74, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v112, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v71, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v117, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v80, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v68, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v72, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v63, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v60, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v64, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v147, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v148, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v58, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v56, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v47, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v46, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v130, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v145, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v132, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v135, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v43, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v42, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v183, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v182, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v116, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v101, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v103, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v100, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v179, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v178, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v165, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0
@@ -156845,13 +156832,13 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v162, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v160, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v151, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2
@@ -156863,15 +156850,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v131, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v119, v0
; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v102, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3
@@ -156881,11 +156868,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36
; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v87, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v96, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v98, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v86, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v81, v32
; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35
; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33
; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0
@@ -156903,28 +156890,28 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v101, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v117, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v112, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v80, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v69, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v64, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v68, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v67, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v71, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v65, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v84, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v82, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v83, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v97, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v114, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v99, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v85, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v113, v29, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v30, v128, v30, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v31, v118, v31, 0x5040100
; GFX11-FAKE16-NEXT: .LBB88_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_clause 0x1f
@@ -160865,85 +160852,85 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:288
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:192
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:160
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:152
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:144
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32
@@ -161051,7 +161038,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -161060,89 +161047,89 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v112
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v115
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v129
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v146
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v118
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v128
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v144
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v133
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v130
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v148
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v20, 16, v21
; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v147
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v149
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147
; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v151
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v151
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160
; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v162
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
@@ -161170,90 +161157,90 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v182
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v181
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v44
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v58
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v47
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v56
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v57
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v62
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v74
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v59
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v72
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v73
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v79
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v78
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v63
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v77
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v77
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v76
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v75
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v88
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v78
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v79
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v92
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v91
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
@@ -161305,98 +161292,98 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v92
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v93
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v91
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v183
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v92
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v88
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v78
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v180
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v77
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v76
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v75
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v88
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v74
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v79
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v72
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v77
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v78
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v63
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v76
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v75
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v60
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v61
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v59
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v59, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v60, 0x300, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v74
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v72
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v180, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v40, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v57
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v62
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v58
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v57, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v56
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v56, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v63
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v183, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v61
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v46, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v47
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v60
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v45
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v58
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v46
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v45, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v44
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v59
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v43, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v43
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v56
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v45
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v42
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v47
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v40
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v183
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v44
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v42
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v41
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v180
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v181
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v182
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v179
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
@@ -161421,84 +161408,84 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v162
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v160
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v162
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v160
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v161
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v151
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v149
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v147
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v150
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v148
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v148
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v163
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v130
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v132
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v128
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v116
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v146
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v144
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v130, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v131
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v129
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v119
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v117
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v113
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v114
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v112
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v103
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3
@@ -161628,26 +161615,26 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v130
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v131
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v128, 16, v33
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v118, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v165, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v161, 16, v32
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v160, 16, v32
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v34
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v56
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v60
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v40
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
@@ -161657,11 +161644,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v116, 16, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v59, 16, v29
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v43, 16, v32
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v183, 16, v33
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v180, 16, v29
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v34
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v35
; GFX11-TRUE16-NEXT: .LBB89_3: ; %end
@@ -161706,38 +161693,37 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1e
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:320
+; GFX11-FAKE16-NEXT: s_clause 0x1d
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
@@ -161747,170 +161733,163 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:304
+; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:300
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:296
+; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:292
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:316
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:152
-; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:204
; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:240
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:244
; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:260
+; GFX11-FAKE16-NEXT: s_clause 0xf
; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:268
; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:308
+; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:312
; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:28
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v19
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v21
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v23
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v27
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v103, 8, v29
; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v4
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v145, 8, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v18
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v20
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v162, 8, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v163, 8, v30
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v183, 8, v183
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v46, 8, v46
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v60
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v74
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v182, 8, v180
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v40
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v43
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v41
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v167
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v45
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v176
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v57
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v177
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v47
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v47, 8, v63
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v73
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v72
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v77
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v78
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v93
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v2
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB89_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff
@@ -161964,16 +161943,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v81
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v83
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v70
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v84
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v85
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53
@@ -161981,158 +161960,158 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v102
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v112
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v69
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v103
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v129
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v116
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82
; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v135
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v80
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v145
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v97
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v133
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v96
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v132
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v160
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v101
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v161
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v151
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v98
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v166
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v100
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v162
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v163
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176
; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v113
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v178
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v114
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v182
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v180
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v130
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v128
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v40
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v183
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v46
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v42
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v134
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v47
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v147
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v44
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v62
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v58
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v61
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v59
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v76
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v75
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93
@@ -162185,14 +162164,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v58
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v75
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v47
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v76
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
@@ -162200,148 +162178,140 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v43
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v61
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v40
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v59
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v90, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v183
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v182
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v62
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v58
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v89, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v164
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v88, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v78, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v44
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v150
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v148, 0x300, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v147
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v73, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v147, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v63, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v42
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v60, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v134
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v146
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v57, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v134, 0x300, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v46, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v119
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v43, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v130
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v128
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v40, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v183, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v182, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v115
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v180, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v113
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v165
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v100
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v178, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v101
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v176, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v100, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v98
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v98, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v101, 0x300, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v3
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v151
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v96
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v97
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v96, 0x300, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v163, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v86
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v82
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v86, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v161, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v160, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v80
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v54
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v53
@@ -162350,71 +162320,70 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v39
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 3, v33
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v145, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v149, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v101, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v102, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v131, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v132, v6
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v69
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v162, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v129, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v52
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v87, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v116, v6
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v51
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v86, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v112, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v84, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v103, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v102, v7
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v50
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v49
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v83, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v99, v7
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v48
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v82, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v87, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v38
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v81, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v80, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v83, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v84, v8
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v37
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v81, v8
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v36
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 3, v35
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v69, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v70, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v34
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v112, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v133, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v68, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v67, v7
@@ -162458,71 +162427,70 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v96
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v98
; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v86, 16, v32
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v101, 16, v33
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v128
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v144
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v134
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v114, 16, v32
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v148
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v164
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v117
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v100, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v147, 16, v32
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v118, 16, v33
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
; GFX11-FAKE16-NEXT: .LBB89_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1e
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:440
+; GFX11-FAKE16-NEXT: s_clause 0x1d
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:436
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB89_4:
@@ -167751,9 +167719,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:12
; GFX11-TRUE16-NEXT: s_clause 0x2
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
@@ -167865,7 +167833,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -168231,7 +168199,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v30, 0x7fff
; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v35, v27
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v103, v34, v38 :: v_dual_and_b32 v38, 0xffff0000, v32
; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
@@ -168260,7 +168228,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v33, v102
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v36, v29
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v31
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v32, 16, 1
@@ -168903,7 +168870,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v128.l
; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v116.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v119.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v114.h
@@ -168917,6 +168883,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
@@ -169002,8 +168969,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:12
; GFX11-FAKE16-NEXT: s_clause 0x2
; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -169432,7 +169399,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v51, 16, 1
; GFX11-FAKE16-NEXT: v_add3_u32 v30, v52, v49, 0x7fff
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v32
; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v51
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v39, v29, vcc_lo
@@ -169445,7 +169412,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31
; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v30, 16, 1
@@ -170086,11 +170052,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v113
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v100
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v96
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v99
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v97
; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v112
@@ -183442,223 +183409,223 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:380
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:364
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v146, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:260
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:252
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v135, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:236
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v117, off, s32 offset:128
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v119, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v118, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:384
; GFX11-TRUE16-NEXT: scratch_load_b32 v160, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v117, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v118, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v119, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v128, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v132, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:156
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v128, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v132, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:340
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:332
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:76
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v12.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v29.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v148.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v147.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v146.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v134.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v129.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v131.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v130.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v119.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v112.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v151.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v100.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v103.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v103.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v114.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v114.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v115.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v115.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v117.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v128.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v117.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v132.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v128.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v133.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v134.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v147.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v150.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v31.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -183671,101 +183638,101 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB92_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v64.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v53.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v70.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v97.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v50.h
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v67.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v66.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v80.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v48.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v81.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v81.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v82.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v83.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v97.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v70.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v98.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v99.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v99.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v101.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v102.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v102.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v103.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v112.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v112.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v113.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v113.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v115.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v86.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v97.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v99.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v103.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v114.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v115.h
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v112.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v118.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v118.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v114.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v113.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v119.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v128.l
; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v128.h
@@ -183800,100 +183767,100 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v150.h
; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v151.l
; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v151.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
@@ -183931,11 +183898,11 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB92_2
; GFX11-TRUE16-NEXT: .LBB92_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v116.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
@@ -183950,16 +183917,16 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v116.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v112.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v113.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v100.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v147.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v148.h, v1.l
@@ -183969,10 +183936,10 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v102.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v101.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
@@ -183983,15 +183950,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v146.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v145.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v96.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v98.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
@@ -184007,13 +183974,11 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v87.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v85.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v85.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
@@ -184028,19 +183993,16 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v71.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v130.h, v1.l
@@ -184050,12 +184012,10 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v66.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
@@ -184066,16 +184026,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v128.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v129.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v119.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v86.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v35.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -184084,7 +184043,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v128.l, v2.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v119.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v114.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v113.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l
@@ -184094,103 +184053,103 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.h, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v70.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v71.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v117.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v114.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v116.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v116.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v115.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v112.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v115.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v115.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v114.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v69.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.h, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v68.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v115.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v113.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v103.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v112.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v113.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v114.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v103.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v100.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v102.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v103.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v67.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v54.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v64.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v112.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v101.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v102.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v103.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v101.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v101.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v97.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v99.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v99.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v97.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v55.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v55.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v51.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v102.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v99.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v87.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v98.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v99.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v98.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v96.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v84.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v86.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v87.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v54.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v54.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v51.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v51.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v52.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v51.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v97.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v70.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v86.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v70.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v82.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v83.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v81.l, v1.h
@@ -184266,207 +184225,204 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:316
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:312
+; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:308
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:304
+; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:300
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:296
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:128
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v127, off, s32 offset:384
; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:56
; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:64
; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:224
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:132
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:292
+; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_load_u16 v120, off, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_load_u16 v123, off, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_load_u16 v121, off, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_load_u16 v122, off, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:92
; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:20
; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v86, 8, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v134, 8, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v19
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v21
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v151, 8, v23
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v27
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v29
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v20
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v31
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v40
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v47
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v56
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v127
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v166
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v178, 8, v176
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v179
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v182
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v183
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v42
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v43
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v46
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v42, 8, v58
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v59
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v57
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v30
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v75
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v72
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v74
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v22
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v63
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v72
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v91
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v88
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v89
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v18
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v104
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v94
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v90
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v107
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v106
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v94
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v6
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v105
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v120
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v123
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v121
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v122
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -184477,10 +184433,10 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v39
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v86
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v98
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v96
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v36
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v51
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
@@ -184492,16 +184448,16 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v50
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54
; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v132
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v100
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v161
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v167
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v119
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v81
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v131
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v115
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v134
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v102
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v160
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v151
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v162
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v161
; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v4, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v4, v7, v6, 0x5040100
@@ -184510,70 +184466,70 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v102
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v114
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v96
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v133
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v135
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v130
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v181
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v103
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v100
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v116
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v101
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v135
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v129
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v145
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v132
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v165
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v150
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v41
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v40
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v43
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v182
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v46
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v57
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v56
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v178
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v179
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v166
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v183
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v182
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v43
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v42
; GFX11-FAKE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v8, v10, v9, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v9, v12, v11, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v10, v14, v13, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v11, v16, v15, 0x5040100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v147
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v119
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v148
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v130
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v149
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v144
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v162
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v146
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v164
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v148
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v58
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v44
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v60
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v62
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v47
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v72
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v63
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v74
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v73
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v147
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v69
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v117
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v46
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v181
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v56
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v47
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v58
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v40
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v60
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v59
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v72
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v63
; GFX11-FAKE16-NEXT: v_perm_b32 v12, v13, v12, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v13, v15, v14, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v14, v17, v16, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v15, v19, v18, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v16, v21, v20, 0x5040100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v165
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v180
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v42
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v65
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v75
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v61
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v144
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v112
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v146
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v71
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v163
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v167
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v164
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v74
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v57
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v78
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v77
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v79
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v76
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v75
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v90
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v89
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v92
@@ -184583,16 +184539,16 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v19, v22, v21, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v23, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v25, 0x5040100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v64
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v68
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v67
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v101
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v177
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v83
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v97
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v180
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v44
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v41
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v45
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v99
; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v93
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v88
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v104
@@ -184608,16 +184564,16 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v26, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v25, v29, v28, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v26, v31, v30, 0x5040100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v99
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v129
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v98
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v131
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v116
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v134
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v113
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v114
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v62
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v73
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v61
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v76
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v133
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v128
; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v111
; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v106
; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v122
@@ -184649,94 +184605,94 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92
@@ -184765,11 +184721,11 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB92_4
; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v133, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v128, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v76, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v118, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v73, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -184781,15 +184737,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v124, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v33, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v98, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v61, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v128, 0x300, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v114, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v62, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v113, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
@@ -184798,38 +184754,38 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v122, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v121, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v113, 0x300, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v111, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v101, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v85, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v45, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v99, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v44, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v41, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v106, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v110, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v109, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v108, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v114, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v97, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v180, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v84, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v83, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v177, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
@@ -184838,142 +184794,134 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v70, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v70, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v167, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v164, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v82, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v163, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v82, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v84, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v71, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v65, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v66, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v144, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0
; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v66, 0x300, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v74, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v112, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v71, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v117, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v80, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v68, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v72, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v63, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v60, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v64, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v147, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v148, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v58, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v56, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v47, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v46, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v130, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v145, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v132, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v135, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v43, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v42, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v183, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v182, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v116, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v101, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v103, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v100, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v179, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v178, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v165, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0
@@ -184987,13 +184935,13 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v162, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v160, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v151, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2
@@ -185005,15 +184953,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v131, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v119, v0
; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v102, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3
@@ -185023,11 +184971,11 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36
; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v87, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v96, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v98, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v86, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v81, v32
; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35
; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33
; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0
@@ -185045,28 +184993,28 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v101, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v117, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v112, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v80, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v69, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v64, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v68, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v67, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v71, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v65, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v84, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v82, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v83, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v97, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v114, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v99, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v85, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v113, v29, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v30, v128, v30, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v31, v118, v31, 0x5040100
; GFX11-FAKE16-NEXT: .LBB92_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_clause 0x1f
@@ -188911,85 +188859,85 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:288
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:192
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:160
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:152
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:144
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32
@@ -189097,7 +189045,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -189106,89 +189054,89 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v112
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v115
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v129
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v146
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v118
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v128
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v144
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v133
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v130
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v148
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v20, 16, v21
; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v147
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v149
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147
; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v151
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v151
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160
; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v162
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
@@ -189216,90 +189164,90 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v182
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v181
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v44
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v58
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v47
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v56
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v57
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v62
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v74
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v59
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v72
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v73
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v79
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v78
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v63
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v77
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v77
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v76
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v75
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v88
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v78
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v79
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v92
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v91
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
@@ -189351,98 +189299,98 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v92
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v93
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v91
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v183
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v92
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v88
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v78
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v180
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v77
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v76
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v75
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v88
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v74
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v79
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v72
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v77
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v78
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v63
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v76
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v75
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v60
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v61
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v59
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v59, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v60, 0x300, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v74
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v72
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v180, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v40, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v57
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v62
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v58
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v57, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v56
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v56, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v63
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v183, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v61
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v46, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v47
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v60
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v45
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v58
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v46
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v45, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v44
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v59
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v43, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v43
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v56
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v45
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v42
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v47
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v40
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v183
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v44
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v42
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v41
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v180
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v181
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v182
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v179
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
@@ -189467,84 +189415,84 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v162
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v160
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v162
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v160
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v161
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v151
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v149
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v147
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v150
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v148
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v148
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v163
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v130
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v132
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v128
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v116
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v146
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v144
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v130, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v131
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v129
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v119
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v117
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v113
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v114
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v112
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v103
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3
@@ -189674,26 +189622,26 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v130
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v131
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v128, 16, v33
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v118, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v165, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v161, 16, v32
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v160, 16, v32
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v34
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v56
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v60
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v40
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
@@ -189703,11 +189651,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v116, 16, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v59, 16, v29
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v43, 16, v32
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v183, 16, v33
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v180, 16, v29
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v34
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v35
; GFX11-TRUE16-NEXT: .LBB93_3: ; %end
@@ -189752,38 +189700,37 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1e
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:320
+; GFX11-FAKE16-NEXT: s_clause 0x1d
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
@@ -189793,170 +189740,163 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:304
+; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:300
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:296
+; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:292
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:316
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:152
-; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:204
; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:240
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:244
; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:260
+; GFX11-FAKE16-NEXT: s_clause 0xf
; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:268
; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:308
+; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:312
; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:28
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v19
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v21
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v23
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v27
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v103, 8, v29
; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v4
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v145, 8, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v18
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v20
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v162, 8, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v163, 8, v30
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v183, 8, v183
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v46, 8, v46
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v60
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v74
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v182, 8, v180
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v40
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v43
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v41
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v167
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v45
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v176
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v57
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v177
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v47
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v47, 8, v63
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v73
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v72
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v77
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v78
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v93
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v2
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB93_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff
@@ -190010,16 +189950,16 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v81
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v83
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v70
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v84
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v85
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53
@@ -190027,158 +189967,158 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v102
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v112
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v69
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v103
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v129
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v116
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82
; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v135
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v80
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v145
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v97
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v133
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v96
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v132
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v160
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v101
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v161
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v151
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v98
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v166
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v100
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v162
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v163
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176
; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v113
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v178
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v114
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v182
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v180
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v130
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v128
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v40
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v183
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v46
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v42
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v134
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v47
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v147
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v44
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v62
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v58
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v61
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v59
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v76
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v75
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93
@@ -190231,14 +190171,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v58
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v75
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v47
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v76
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
@@ -190246,148 +190185,140 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v43
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v61
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v40
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v59
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v90, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v183
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v182
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v62
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v58
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v89, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v164
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v88, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v78, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v44
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v150
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v148, 0x300, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v147
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v73, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v147, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v63, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v42
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v60, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v134
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v146
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v57, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v134, 0x300, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v46, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v119
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v43, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v130
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v128
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v40, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v183, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v182, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v115
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v180, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v113
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v165
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v100
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v178, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v101
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v176, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v100, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v98
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v98, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v101, 0x300, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v3
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v151
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v96
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v97
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v96, 0x300, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v163, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v86
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v82
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v86, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v161, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v160, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v80
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v54
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v53
@@ -190396,71 +190327,70 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v39
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 3, v33
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v145, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v149, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v101, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v102, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v131, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v132, v6
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v69
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v162, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v129, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v52
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v87, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v116, v6
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v51
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v86, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v112, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v84, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v103, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v102, v7
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v50
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v49
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v83, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v99, v7
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v48
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v82, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v87, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v38
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v81, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v80, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v83, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v84, v8
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v37
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v81, v8
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v36
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 3, v35
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v69, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v70, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v34
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v112, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v133, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v68, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v67, v7
@@ -190504,71 +190434,70 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v96
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v98
; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v86, 16, v32
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v101, 16, v33
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v128
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v144
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v134
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v114, 16, v32
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v148
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v164
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v117
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v100, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v147, 16, v32
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v118, 16, v33
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
; GFX11-FAKE16-NEXT: .LBB93_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1e
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:440
+; GFX11-FAKE16-NEXT: s_clause 0x1d
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:436
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB93_4:
@@ -194624,9 +194553,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x2
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
@@ -194690,7 +194619,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -194767,9 +194696,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
@@ -195139,11 +195067,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v71.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v70.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h
@@ -195210,8 +195138,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
; GFX11-FAKE16-NEXT: s_clause 0x2
; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -195418,9 +195346,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
@@ -195811,11 +195738,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v87
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v85
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v84
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v81
; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v71
@@ -207157,223 +207085,223 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:380
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:364
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v146, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:260
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:252
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v135, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:236
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v117, off, s32 offset:128
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v119, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v118, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:384
; GFX11-TRUE16-NEXT: scratch_load_b32 v160, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v117, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v118, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v119, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v128, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v132, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:156
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v128, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v132, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:340
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:332
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:76
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v12.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v29.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v148.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v147.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v146.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v134.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v129.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v131.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v130.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v119.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v112.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v151.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v100.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v103.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v103.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v114.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v114.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v115.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v115.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v117.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v128.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v117.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v132.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v128.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v133.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v134.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v147.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v150.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v31.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -207386,101 +207314,101 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB96_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v64.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v53.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v70.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v97.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v50.h
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v67.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v66.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v80.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v48.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v81.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v81.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v82.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v83.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v97.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v70.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v98.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v99.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v99.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v101.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v102.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v102.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v103.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v112.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v112.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v113.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v113.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v115.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v86.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v97.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v99.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v103.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v114.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v115.h
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v112.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v118.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v118.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v114.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v113.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v119.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v128.l
; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v128.h
@@ -207515,100 +207443,100 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v150.h
; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v151.l
; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v151.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
@@ -207646,11 +207574,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_2
; GFX11-TRUE16-NEXT: .LBB96_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v116.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
@@ -207665,16 +207593,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v116.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v112.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v113.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v100.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v147.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v148.h, v1.l
@@ -207684,10 +207612,10 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v102.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v101.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
@@ -207698,15 +207626,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v146.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v145.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v96.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v98.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
@@ -207722,13 +207650,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v87.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v85.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v85.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
@@ -207743,19 +207669,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v71.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v130.h, v1.l
@@ -207765,12 +207688,10 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v66.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
@@ -207781,16 +207702,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v128.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v129.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v119.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v86.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v35.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -207799,7 +207719,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v128.l, v2.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v119.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v114.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v113.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l
@@ -207809,103 +207729,103 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.h, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v70.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v71.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v117.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v114.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v116.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v116.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v115.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v112.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v115.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v115.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v114.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v69.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.h, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v68.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v115.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v113.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v103.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v112.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v113.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v114.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v103.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v100.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v102.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v103.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v67.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v54.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v64.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v112.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v101.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v102.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v103.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v101.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v101.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v97.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v99.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v99.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v97.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v55.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v55.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v51.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v102.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v99.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v87.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v98.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v99.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v98.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v96.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v84.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v86.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v87.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v54.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v54.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v51.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v51.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v52.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v51.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v97.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v70.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v86.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v70.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v82.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v83.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v81.l, v1.h
@@ -207981,207 +207901,204 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:316
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:312
+; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:308
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:304
+; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:300
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:296
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:128
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v127, off, s32 offset:384
; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:56
; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:64
; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:224
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:132
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:292
+; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_load_u16 v120, off, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_load_u16 v123, off, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_load_u16 v121, off, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_load_u16 v122, off, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:92
; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:20
; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v86, 8, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v134, 8, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v19
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v21
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v151, 8, v23
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v27
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v29
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v20
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v31
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v40
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v47
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v56
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v127
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v166
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v178, 8, v176
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v179
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v182
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v183
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v42
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v43
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v46
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v42, 8, v58
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v59
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v57
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v30
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v75
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v72
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v74
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v22
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v63
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v72
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v91
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v88
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v89
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v18
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v104
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v94
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v90
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v107
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v106
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v94
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v6
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v105
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v120
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v123
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v121
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v122
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -208192,10 +208109,10 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v39
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v86
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v98
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v96
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v36
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v51
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
@@ -208207,16 +208124,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v50
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54
; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v132
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v100
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v161
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v167
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v119
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v81
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v131
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v115
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v134
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v102
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v160
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v151
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v162
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v161
; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v4, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v4, v7, v6, 0x5040100
@@ -208225,70 +208142,70 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v102
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v114
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v96
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v133
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v135
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v130
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v181
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v103
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v100
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v116
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v101
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v135
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v129
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v145
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v132
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v165
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v150
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v41
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v40
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v43
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v182
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v46
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v57
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v56
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v178
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v179
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v166
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v183
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v182
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v43
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v42
; GFX11-FAKE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v8, v10, v9, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v9, v12, v11, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v10, v14, v13, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v11, v16, v15, 0x5040100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v147
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v119
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v148
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v130
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v149
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v144
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v162
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v146
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v164
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v148
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v58
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v44
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v60
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v62
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v47
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v72
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v63
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v74
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v73
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v147
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v69
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v117
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v46
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v181
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v56
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v47
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v58
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v40
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v60
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v59
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v72
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v63
; GFX11-FAKE16-NEXT: v_perm_b32 v12, v13, v12, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v13, v15, v14, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v14, v17, v16, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v15, v19, v18, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v16, v21, v20, 0x5040100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v165
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v180
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v42
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v65
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v75
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v61
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v144
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v112
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v146
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v71
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v163
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v167
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v164
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v74
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v57
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v78
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v77
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v79
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v76
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v75
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v90
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v89
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v92
@@ -208298,16 +208215,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v19, v22, v21, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v23, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v25, 0x5040100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v64
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v68
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v67
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v101
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v177
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v83
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v97
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v180
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v44
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v41
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v45
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v99
; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v93
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v88
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v104
@@ -208323,16 +208240,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v26, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v25, v29, v28, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v26, v31, v30, 0x5040100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v99
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v129
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v98
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v131
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v116
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v134
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v113
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v114
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v62
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v73
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v61
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v76
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v133
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v128
; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v111
; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v106
; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v122
@@ -208364,94 +208281,94 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92
@@ -208480,11 +208397,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_4
; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v133, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v128, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v76, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v118, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v73, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -208496,15 +208413,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v124, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v33, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v98, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v61, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v128, 0x300, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v114, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v62, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v113, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
@@ -208513,38 +208430,38 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v122, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v121, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v113, 0x300, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v111, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v101, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v85, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v45, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v99, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v44, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v41, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v106, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v110, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v109, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v108, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v114, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v97, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v180, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v84, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v83, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v177, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
@@ -208553,142 +208470,134 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v70, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v70, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v167, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v164, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v82, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v163, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v82, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v84, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v71, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v65, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v66, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v144, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0
; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v66, 0x300, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v74, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v112, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v71, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v117, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v80, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v68, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v72, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v63, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v60, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v64, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v147, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v148, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v58, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v56, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v47, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v46, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v130, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v145, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v132, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v135, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v43, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v42, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v183, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v182, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v116, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v101, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v103, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v100, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v179, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v178, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v165, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0
@@ -208702,13 +208611,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v162, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v160, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v151, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1
; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2
@@ -208720,15 +208629,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v131, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v119, v0
; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v102, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3
@@ -208738,11 +208647,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36
; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v87, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v96, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v98, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v86, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v81, v32
; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35
; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33
; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0
@@ -208760,28 +208669,28 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v101, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v117, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v112, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v80, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v69, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v64, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v68, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v67, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v71, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v65, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v84, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v82, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v83, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v97, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v114, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v99, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v85, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v113, v29, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v30, v128, v30, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v31, v118, v31, 0x5040100
; GFX11-FAKE16-NEXT: .LBB96_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_clause 0x1f
@@ -212565,85 +212474,85 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:288
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:192
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:160
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:152
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:144
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32
@@ -212751,7 +212660,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -212760,89 +212669,89 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v112
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v115
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v129
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v146
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v118
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v128
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v144
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v133
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v130
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v148
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v20, 16, v21
; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v147
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v149
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147
; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v151
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v151
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160
; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v162
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
@@ -212870,90 +212779,90 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v182
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v181
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v44
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v58
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v47
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v56
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v57
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v62
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v74
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v59
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v72
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v73
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v79
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v78
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v63
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v77
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v77
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v76
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v75
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v88
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v78
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v79
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v92
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v91
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
@@ -213005,98 +212914,98 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v92
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v93
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v91
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v183
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v92
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v88
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v78
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v180
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v77
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v76
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v75
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v88
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v74
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v79
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v72
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v77
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v78
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v63
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v76
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v75
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v60
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v61
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v59
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v59, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v60, 0x300, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v74
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v72
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v180, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v40, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v57
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v62
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v58
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v57, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v56
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v56, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v63
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v183, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v61
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v46, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v47
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v60
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v45
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v58
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v46
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v45, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v44
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v59
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v43, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v43
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v56
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v45
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v42
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v47
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v40
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v183
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v44
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v42
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v41
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v180
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v181
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v182
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v179
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
@@ -213121,84 +213030,84 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v162
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v160
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v162
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v160
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v161
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v151
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v149
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v147
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v150
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v148
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v148
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v163
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v130
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v132
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v128
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v116
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v146
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v144
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v130, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v131
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v129
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v119
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v117
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v113
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v114
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v112
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v103
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3
@@ -213328,26 +213237,26 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v130
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v131
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v128, 16, v33
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v118, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v165, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v161, 16, v32
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v160, 16, v32
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v34
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v56
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v60
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v40
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
@@ -213357,11 +213266,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v116, 16, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v59, 16, v29
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v43, 16, v32
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v183, 16, v33
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v180, 16, v29
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v34
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v35
; GFX11-TRUE16-NEXT: .LBB97_3: ; %end
@@ -213406,38 +213315,37 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1e
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:320
+; GFX11-FAKE16-NEXT: s_clause 0x1d
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:436
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
@@ -213447,170 +213355,163 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:304
+; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:300
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:296
+; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:292
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:316
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:152
-; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:204
; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:240
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:244
; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:260
+; GFX11-FAKE16-NEXT: s_clause 0xf
; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:268
; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:308
+; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:312
; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:28
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v19
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v21
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v23
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v27
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v103, 8, v29
; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v4
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v145, 8, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v18
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v20
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v162, 8, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v163, 8, v30
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v183, 8, v183
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v46, 8, v46
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v60
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v74
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v182, 8, v180
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v40
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v43
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v41
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v167
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v45
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v176
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v57
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v177
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v47
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v47, 8, v63
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v73
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v72
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v77
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v78
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v93
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v2
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB97_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff
@@ -213664,16 +213565,16 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v81
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v83
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v70
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v87
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v84
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v85
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53
@@ -213681,158 +213582,158 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v102
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v112
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v69
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v103
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v129
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v116
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82
; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v135
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v80
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v145
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v97
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v133
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v96
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v132
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v160
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v101
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v161
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v151
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v98
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v166
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v100
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v162
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v163
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176
; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v113
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v178
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v114
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v182
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v180
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v130
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v128
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v40
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v183
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v46
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v42
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v134
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v47
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v147
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v44
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v62
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v58
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v61
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v59
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v76
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v75
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93
@@ -213885,14 +213786,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v58
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v75
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v47
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v76
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
@@ -213900,148 +213800,140 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v43
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v61
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v40
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v59
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v90, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v183
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v182
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v62
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v58
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v89, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v164
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v88, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v78, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v44
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v150
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v148, 0x300, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v147
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v73, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v147, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v63, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v42
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v60, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v134
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v146
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v57, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v134, 0x300, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v46, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v119
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v43, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v130
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v128
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v40, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v183, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v182, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v115
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v180, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v113
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v165
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v100
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v178, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v101
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v176, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v100, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v98
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v98, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v101, 0x300, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v3
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v151
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v96
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v97
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v96, 0x300, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v163, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v86
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v82
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v86, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v161, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v160, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v80
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v54
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v53
@@ -214050,71 +213942,70 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v39
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 3, v33
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v145, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v149, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v101, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v102, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v131, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v132, v6
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v69
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v162, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v129, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v52
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v87, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v116, v6
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v51
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v86, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v112, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v84, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v103, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v102, v7
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v50
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v49
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v83, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v99, v7
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v48
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v82, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v87, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v38
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v81, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v80, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v83, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v84, v8
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v37
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v81, v8
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v36
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 3, v35
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v69, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v70, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v34
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v112, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v133, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v68, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v67, v7
@@ -214158,71 +214049,70 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v96
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v98
; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v86, 16, v32
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v101, 16, v33
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v128
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v144
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v134
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v114, 16, v32
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v148
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v164
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v117
; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v100, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v147, 16, v32
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v118, 16, v33
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
; GFX11-FAKE16-NEXT: .LBB97_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1e
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:440
+; GFX11-FAKE16-NEXT: s_clause 0x1d
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:320
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:324
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:328
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:332
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:336
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:340
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:344
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:348
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:352
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:356
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:360
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:364
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:368
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:372
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:376
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:380
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:384
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:388
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:392
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:396
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:400
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:404
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:408
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:412
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:416
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:420
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:424
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:428
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:432
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:436
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB97_4:
@@ -218482,9 +218372,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x2
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
@@ -218548,7 +218438,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -218625,9 +218515,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
@@ -218997,11 +218886,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v71.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v70.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h
@@ -219068,8 +218957,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
; GFX11-FAKE16-NEXT: s_clause 0x2
; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -219276,9 +219165,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
@@ -219669,11 +219557,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v87
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v85
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v84
; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v81
; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v71
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 35ab38c67b1ec..91689b9ef3465 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -5277,15 +5277,15 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x9
-; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
@@ -5319,14 +5319,14 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v33.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -5574,15 +5574,15 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, v4 :: v_dual_mov_b32 v32, v2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, v0
; GFX11-FAKE16-NEXT: s_clause 0x9
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_load_u16 v36, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:24
; GFX11-FAKE16-NEXT: scratch_load_u16 v37, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_u16 v38, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:4
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v3
@@ -5599,17 +5599,17 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v29
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v4
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v8
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v66
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -12764,15 +12764,15 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x9
-; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
@@ -12806,14 +12806,14 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v33.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -13061,15 +13061,15 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, v4 :: v_dual_mov_b32 v32, v2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, v0
; GFX11-FAKE16-NEXT: s_clause 0x9
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_load_u16 v36, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:24
; GFX11-FAKE16-NEXT: scratch_load_u16 v37, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_u16 v38, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:4
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v3
@@ -13086,17 +13086,17 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v29
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v4
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v8
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v66
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -19860,16 +19860,16 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x9
-; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
@@ -19908,14 +19908,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v34.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v33.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v35.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v37
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -19945,9 +19945,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v18.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v18.h
@@ -19985,9 +19985,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
@@ -20012,11 +20012,10 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -20104,16 +20103,16 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v35, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
; GFX11-FAKE16-NEXT: s_clause 0x9
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, v14 :: v_dual_mov_b32 v32, v12
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v49, 8, v3
@@ -20127,20 +20126,20 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v19
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v21
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v29
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v4
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v8
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v10
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -20187,13 +20186,13 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v28
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v30
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v67
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v68
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v65
; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v21
; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v64
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v25
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v69
; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v23
@@ -20222,10 +20221,10 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
@@ -20238,7 +20237,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
@@ -20249,14 +20248,12 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-FAKE16-NEXT: .LBB50_4: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v68, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v66, 3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v67, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v64, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v28, 3
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v67, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v68, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
@@ -20289,7 +20286,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v25, v0
; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v64, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v66, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v18, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v16, 3
@@ -26437,16 +26434,16 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x9
-; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
@@ -26485,14 +26482,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v34.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v33.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v35.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v37
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -26522,9 +26519,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v18.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v18.h
@@ -26562,9 +26559,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
@@ -26589,11 +26586,10 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2
; GFX11-TRUE16-NEXT: .LBB62_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -26681,16 +26677,16 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v35, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
; GFX11-FAKE16-NEXT: s_clause 0x9
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, v14 :: v_dual_mov_b32 v32, v12
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v49, 8, v3
@@ -26704,20 +26700,20 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v19
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v21
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v29
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v4
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v8
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v10
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -26764,13 +26760,13 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v28
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v30
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v67
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v68
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v65
; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v21
; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v64
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v25
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v69
; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v23
@@ -26799,10 +26795,10 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
@@ -26815,7 +26811,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
@@ -26826,14 +26822,12 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB62_2
; GFX11-FAKE16-NEXT: .LBB62_4: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v68, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v66, 3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v67, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v64, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v28, 3
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v67, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v68, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
@@ -26866,7 +26860,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v25, v0
; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v64, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v66, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v18, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v0
; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v16, 3
@@ -30795,15 +30789,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x9
-; GFX11-TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
@@ -30837,14 +30831,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v36.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -31092,15 +31086,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
; GFX11-FAKE16-NEXT: s_clause 0x9
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:24
; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v1
@@ -31118,17 +31112,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v29
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v4
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v10
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -35441,15 +35435,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x9
-; GFX11-TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
@@ -35483,14 +35477,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v36.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -35738,15 +35732,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
; GFX11-FAKE16-NEXT: s_clause 0x9
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:24
; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v1
@@ -35764,17 +35758,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v29
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v4
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v10
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 65fde2fd5e190..1e24ed30fd2e4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -12870,29 +12870,29 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:96
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:88
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68
@@ -12936,40 +12936,40 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v64.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v65.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v67.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v68.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v70.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v70.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v71.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3
@@ -13353,29 +13353,29 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128
; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:120
; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:112
; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:24
; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:76
; GFX11-FAKE16-NEXT: scratch_load_u16 v54, off, s32 offset:68
@@ -13403,41 +13403,40 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v29
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v6
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v10
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v12
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v14
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v65
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v66
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v65
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v67
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v81
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v67
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v82
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v83
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v84
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v85, 8, v85
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v129
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v130
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v128
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v131
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v129
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -15125,16 +15124,16 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12
@@ -15157,7 +15156,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
@@ -15278,17 +15277,17 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v39
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48
; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87
@@ -15485,19 +15484,19 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v39
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15
@@ -15544,18 +15543,18 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56
; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:48
; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:56
; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28
; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12
@@ -15576,24 +15575,24 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29
; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v8
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v10
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v12
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v14
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v86
; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB27_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -28229,29 +28228,29 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:96
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:88
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68
@@ -28295,40 +28294,40 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v64.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v65.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v67.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v68.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v70.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v70.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v71.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3
@@ -28712,29 +28711,29 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128
; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:120
; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:112
; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:24
; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:76
; GFX11-FAKE16-NEXT: scratch_load_u16 v54, off, s32 offset:68
@@ -28762,41 +28761,40 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v29
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v6
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v10
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v12
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v14
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v65
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v66
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v65
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v67
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v81
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v67
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v82
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v83
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v84
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v85, 8, v85
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v129
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v130
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v128
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v131
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v129
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -30484,16 +30482,16 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12
@@ -30516,7 +30514,7 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
@@ -30637,17 +30635,17 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v39
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48
; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87
@@ -30844,19 +30842,19 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v39
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15
@@ -30903,18 +30901,18 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56
; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:48
; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:56
; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28
; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12
@@ -30935,24 +30933,24 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29
; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v8
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v10
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v12
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v14
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v86
; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -42859,29 +42857,29 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:96
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:88
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68
@@ -42925,40 +42923,40 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v64.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v65.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v67.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v68.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v70.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v70.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v71.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3
@@ -43342,29 +43340,29 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128
; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:120
; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:112
; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:24
; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:76
; GFX11-FAKE16-NEXT: scratch_load_u16 v54, off, s32 offset:68
@@ -43392,41 +43390,40 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v29
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v6
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v10
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v12
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v14
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v65
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v66
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v65
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v67
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v81
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v67
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v82
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v83
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v84
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v85, 8, v85
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v129
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v130
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v128
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v131
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v129
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -45114,16 +45111,16 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12
@@ -45146,7 +45143,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
@@ -45267,17 +45264,17 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v39
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48
; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87
@@ -45474,19 +45471,19 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v39
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15
@@ -45533,18 +45530,18 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56
; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:48
; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:56
; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28
; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12
@@ -45565,24 +45562,24 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29
; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v8
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v10
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v12
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v14
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v86
; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB71_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -56643,29 +56640,29 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:96
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:88
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68
@@ -56709,40 +56706,40 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v64.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v65.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v67.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v68.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v70.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v70.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v71.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3
@@ -57126,29 +57123,29 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128
; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:120
; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:112
; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:24
; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:76
; GFX11-FAKE16-NEXT: scratch_load_u16 v54, off, s32 offset:68
@@ -57176,41 +57173,40 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v29
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v6
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v10
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v12
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v14
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v65
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v66
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v65
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v67
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v81
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v67
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v82
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v83
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v84
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v85, 8, v85
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v129
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v130
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v128
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v131
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v129
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -58898,16 +58894,16 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12
@@ -58930,7 +58926,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
@@ -59051,17 +59047,17 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v39
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48
; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87
@@ -59258,19 +59254,19 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v39
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15
@@ -59317,18 +59313,18 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56
; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:48
; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:56
; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28
; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12
@@ -59349,24 +59345,24 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29
; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v8
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v10
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v12
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v14
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v86
; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB87_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -70809,41 +70805,41 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:124
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:108
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:88
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l
@@ -70879,18 +70875,20 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
@@ -70898,22 +70896,19 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v51.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v52.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v54.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v64.h
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v65
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -70941,22 +70936,22 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v23.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v23.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v20.l
@@ -71005,22 +71000,22 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
@@ -71056,15 +71051,15 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2
; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v37.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v37.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v55.l, v0.l
@@ -71079,17 +71074,17 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v36.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v54.h, v2.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.l
@@ -71097,18 +71092,18 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v35.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v32.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -71199,41 +71194,41 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:20
; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:4
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v3
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v49, 8, v5
@@ -71244,43 +71239,41 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v15
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v17
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v23
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v21
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v23
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v29
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v8
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v14
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v87
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v97
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v96
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v100
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v100
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v101
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v101
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v102
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v102
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v103
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v103
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v112
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v113
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v116
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v117
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v116
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v115
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v2
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -71317,8 +71310,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v53
; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v52
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v83
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v81
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v81
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v71
; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v2, v4, v3, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v3, v6, v5, 0x5040100
@@ -71329,19 +71322,19 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v28
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v30
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v68
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v69
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v82
; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v21
; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v19
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v27
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v98
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v97
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v96
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v87
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v29
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v112
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v103
@@ -71350,16 +71343,16 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v8, v11, v10, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v9, v13, v12, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v10, v15, v14, 0x5040100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v86
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v82
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v96
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v83
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v99
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v64
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v66
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v98
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v66
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v101
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v100
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v114
@@ -71391,22 +71384,22 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
@@ -71417,15 +71410,15 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
@@ -71442,15 +71435,15 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_2
; GFX11-FAKE16-NEXT: .LBB98_4: ; %cmp.true
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v70, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v66, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v64, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v68, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v66, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v98, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v86, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v97, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v96, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v118, v0
@@ -71458,7 +71451,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v99, 3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v116, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v5
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v3
@@ -71469,46 +71462,46 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_add_nc_u16 v66, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v117, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v85, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v83, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v84, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v82, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v80, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v85, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v84, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v82, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v102, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v101, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v112, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v103, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v70, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v82, 0x300, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v80, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v70, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v68, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v67, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v71, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v69, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v64, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v28, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v96, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v87, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v29, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v98, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v97, v0
; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v25, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v2
@@ -71530,9 +71523,9 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v19, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v81, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v81, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v16, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v0
@@ -71586,12 +71579,12 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v7, v19, v7, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v9, v25, v9, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v10, v68, v10, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v11, v70, v11, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v12, v69, v12, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v13, v67, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v67, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v82, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v80, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v68, v13, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v14, v66, v14, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v15, v64, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v65, v15, 0x5040100
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
@@ -72896,24 +72889,24 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:28
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l
@@ -72944,7 +72937,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_4
@@ -73035,10 +73028,10 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v80
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13
@@ -73046,22 +73039,22 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v83
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v85
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87
; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v49
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v71
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11
@@ -73121,42 +73114,43 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v70
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v81
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v85
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v71
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v69
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v84
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v83
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5
@@ -73172,7 +73166,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v50
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
@@ -73288,21 +73282,21 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:4
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3
@@ -73320,24 +73314,24 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29
; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v8
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v10
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v12
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v14
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v85
; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB99_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -73392,7 +73386,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v24
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v70
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34
@@ -73406,7 +73400,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v22
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v68
; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19
; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -73416,26 +73410,26 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v81
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v69
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v28
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v27
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v67
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v65
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v66
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v84
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v25
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v81
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v82
; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff, v13
; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v85
; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff, v14
@@ -73451,8 +73445,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB99_3
; GFX11-FAKE16-NEXT: .LBB99_2: ; %cmp.true
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v70
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v68
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v64
@@ -73462,14 +73456,14 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v71, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v80, v5
; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v66, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v67, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v28
@@ -73564,17 +73558,17 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v69
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v81
; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v65
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v66
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v65
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v83, v2
@@ -73586,7 +73580,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v85, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v82, v3
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
@@ -83880,41 +83874,41 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:124
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:108
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:88
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l
@@ -83950,18 +83944,20 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
@@ -83969,22 +83965,19 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v51.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v52.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v54.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v64.h
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v65
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -84012,22 +84005,22 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v23.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v23.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v20.l
@@ -84076,22 +84069,22 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
@@ -84127,15 +84120,15 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2
; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v37.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v37.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v55.l, v0.l
@@ -84150,17 +84143,17 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v36.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v54.h, v2.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.l
@@ -84168,18 +84161,18 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v35.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v32.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -84270,41 +84263,41 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:20
; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:4
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v3
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v49, 8, v5
@@ -84315,43 +84308,41 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v15
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v17
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v23
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v21
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v23
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v29
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v8
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v14
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v87
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v97
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v96
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v100
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v100
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v101
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v101
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v102
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v102
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v103
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v103
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v112
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v113
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v116
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v117
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v116
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v115
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v2
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -84388,8 +84379,8 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v53
; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v52
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v83
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v81
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v81
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v71
; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v2, v4, v3, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v3, v6, v5, 0x5040100
@@ -84400,19 +84391,19 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v28
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v30
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v68
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v69
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v82
; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v21
; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v19
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v27
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v98
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v97
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v96
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v87
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v29
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v112
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v103
@@ -84421,16 +84412,16 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v8, v11, v10, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v9, v13, v12, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v10, v15, v14, 0x5040100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v86
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v82
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v96
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v83
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v99
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v64
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v66
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v98
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v66
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v101
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v100
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v114
@@ -84462,22 +84453,22 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
@@ -84488,15 +84479,15 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
@@ -84513,15 +84504,15 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB106_2
; GFX11-FAKE16-NEXT: .LBB106_4: ; %cmp.true
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v70, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v66, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v64, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v68, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v66, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v98, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v86, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v97, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v96, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v118, v0
@@ -84529,7 +84520,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v99, 3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v116, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v5
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v3
@@ -84540,46 +84531,46 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_add_nc_u16 v66, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v117, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v85, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v83, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v84, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v82, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v80, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v85, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v84, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v82, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v102, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v101, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v112, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v103, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v70, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v82, 0x300, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v80, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v70, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v68, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v67, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v71, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v69, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v64, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v28, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v96, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v87, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v29, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v98, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v97, v0
; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v25, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v2
@@ -84601,9 +84592,9 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v19, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v81, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v81, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v16, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v0
@@ -84657,12 +84648,12 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v7, v19, v7, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v9, v25, v9, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v10, v68, v10, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v11, v70, v11, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v12, v69, v12, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v13, v67, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v67, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v82, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v80, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v68, v13, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v14, v66, v14, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v15, v64, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v65, v15, 0x5040100
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
@@ -85932,24 +85923,24 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:28
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l
@@ -85980,7 +85971,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB107_4
@@ -86071,10 +86062,10 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v80
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13
@@ -86082,22 +86073,22 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v83
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v85
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87
; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v49
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v71
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11
@@ -86157,42 +86148,43 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v70
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v81
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v85
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v71
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v69
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v84
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v83
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5
@@ -86208,7 +86200,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v50
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
@@ -86324,21 +86316,21 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:4
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3
@@ -86356,24 +86348,24 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29
; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v8
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v10
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v12
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v14
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v85
; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB107_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -86428,7 +86420,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v24
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v70
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34
@@ -86442,7 +86434,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v22
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v68
; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19
; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -86452,26 +86444,26 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v81
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v69
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v28
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v27
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v67
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v65
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v66
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v84
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v25
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v81
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v82
; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff, v13
; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v85
; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff, v14
@@ -86487,8 +86479,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB107_3
; GFX11-FAKE16-NEXT: .LBB107_2: ; %cmp.true
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v70
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v68
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v64
@@ -86498,14 +86490,14 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v71, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v80, v5
; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v66, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v67, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v28
@@ -86600,17 +86592,17 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v69
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v81
; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v65
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v66
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v65
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v83, v2
@@ -86622,7 +86614,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v85, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v82, v3
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
@@ -95249,41 +95241,41 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:124
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:108
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:88
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l
@@ -95319,18 +95311,20 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
@@ -95338,22 +95332,19 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v51.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v52.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v54.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v64.h
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v65
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -95381,22 +95372,22 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v23.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v23.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v20.l
@@ -95445,22 +95436,22 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
@@ -95496,15 +95487,15 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2
; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v37.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v37.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v55.l, v0.l
@@ -95519,17 +95510,17 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v36.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v54.h, v2.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.l
@@ -95537,18 +95528,18 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v35.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v32.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -95639,41 +95630,41 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:20
; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:4
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v3
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v49, 8, v5
@@ -95684,43 +95675,41 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v15
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v17
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v23
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v21
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v23
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v25
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v29
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v8
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v14
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v87
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v97
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v96
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v100
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v100
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v101
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v101
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v102
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v102
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v103
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v103
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v112
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v113
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v116
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v117
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v116
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v115
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v2
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -95757,8 +95746,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v53
; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v52
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v83
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v81
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v81
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v71
; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v2, v4, v3, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v3, v6, v5, 0x5040100
@@ -95769,19 +95758,19 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v28
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v30
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v68
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v69
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v82
; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v21
; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v19
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v27
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v98
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v97
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v96
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v87
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v29
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v112
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v103
@@ -95790,16 +95779,16 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v8, v11, v10, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v9, v13, v12, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v10, v15, v14, 0x5040100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v86
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v82
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v96
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v83
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v99
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v64
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v66
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v98
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v66
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v101
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v100
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v114
@@ -95831,22 +95820,22 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
@@ -95857,15 +95846,15 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
@@ -95882,15 +95871,15 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB110_2
; GFX11-FAKE16-NEXT: .LBB110_4: ; %cmp.true
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v70, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v66, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v64, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v68, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v66, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v98, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v86, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v97, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v96, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v118, v0
@@ -95898,7 +95887,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v99, 3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v116, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v5
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v3
@@ -95909,46 +95898,46 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_add_nc_u16 v66, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v117, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v85, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v83, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v84, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v82, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v80, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v85, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v84, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v82, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v102, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v101, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v112, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v103, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v70, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v82, 0x300, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v80, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v70, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v68, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v67, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v71, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v69, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v64, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v28, 3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v96, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v87, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v29, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v98, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v97, v0
; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v25, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v2
@@ -95970,9 +95959,9 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v19, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v81, v3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v81, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v4
; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2
; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v16, 3
; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v0
@@ -96026,12 +96015,12 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v7, v19, v7, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v9, v25, v9, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v10, v68, v10, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v11, v70, v11, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v12, v69, v12, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v13, v67, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v67, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v82, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v80, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v68, v13, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v14, v66, v14, 0x5040100
-; GFX11-FAKE16-NEXT: v_perm_b32 v15, v64, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v65, v15, 0x5040100
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
@@ -97305,24 +97294,24 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:28
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l
@@ -97353,7 +97342,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB111_4
@@ -97444,10 +97433,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v80
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13
@@ -97455,22 +97444,22 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v83
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v85
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87
; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v49
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v71
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11
@@ -97530,42 +97519,43 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v70
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v81
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v85
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v71
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v69
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v84
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v83
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5
@@ -97581,7 +97571,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v50
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
@@ -97697,21 +97687,21 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:4
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3
@@ -97729,24 +97719,24 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29
; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v8
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v10
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v12
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v14
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v85
; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB111_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -97801,7 +97791,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v24
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v70
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34
@@ -97815,7 +97805,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v22
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v68
; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19
; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -97825,26 +97815,26 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v81
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v69
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v28
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v27
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v67
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v65
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v66
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v84
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v25
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v81
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v82
; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff, v13
; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v85
; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff, v14
@@ -97860,8 +97850,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB111_3
; GFX11-FAKE16-NEXT: .LBB111_2: ; %cmp.true
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v70
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v68
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v64
@@ -97871,14 +97861,14 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v71, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v80, v5
; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v66, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v67, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v28
@@ -97973,17 +97963,17 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v69
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v81
; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v65
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v66
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v65
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v83, v2
@@ -97995,7 +97985,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v85, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v82, v3
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index 8ca3e8255b634..97b9b0b8d2786 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -103,40 +103,42 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX11-NEXT: s_mov_b32 s32, 0
; GISEL-GFX11-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
+; GISEL-GFX11-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11
+; GISEL-GFX11-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13
+; GISEL-GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15
; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 4
+; GISEL-GFX11-NEXT: s_add_u32 s25, s32, 8
+; GISEL-GFX11-NEXT: s_add_u32 s26, s32, 12
+; GISEL-GFX11-NEXT: s_add_u32 s27, s32, 16
+; GISEL-GFX11-NEXT: s_add_u32 s28, s32, 20
+; GISEL-GFX11-NEXT: s_add_u32 s29, s32, 24
+; GISEL-GFX11-NEXT: s_add_u32 s30, s32, 28
+; GISEL-GFX11-NEXT: s_clause 0x7
; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32
; GISEL-GFX11-NEXT: scratch_store_b32 off, v17, s24
-; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 8
-; GISEL-GFX11-NEXT: s_add_u32 s25, s32, 12
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v18, s24
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v19, s25
-; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 16
-; GISEL-GFX11-NEXT: s_add_u32 s25, s32, 20
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v20, s24
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v21, s25
-; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 24
-; GISEL-GFX11-NEXT: s_add_u32 s25, s32, 28
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v22, s24
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v23, s25
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v18, s25
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v19, s26
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v20, s27
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v21, s28
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v22, s29
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v23, s30
; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 32
; GISEL-GFX11-NEXT: s_add_u32 s25, s32, 36
+; GISEL-GFX11-NEXT: s_add_u32 s26, s32, 40
+; GISEL-GFX11-NEXT: s_add_u32 s27, s32, 44
+; GISEL-GFX11-NEXT: s_add_u32 s28, s32, 48
+; GISEL-GFX11-NEXT: s_add_u32 s29, s32, 52
+; GISEL-GFX11-NEXT: s_add_u32 s30, s32, 56
+; GISEL-GFX11-NEXT: s_add_u32 s31, s32, 60
+; GISEL-GFX11-NEXT: s_clause 0x7
; GISEL-GFX11-NEXT: scratch_store_b32 off, v24, s24
; GISEL-GFX11-NEXT: scratch_store_b32 off, v25, s25
-; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 40
-; GISEL-GFX11-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11
-; GISEL-GFX11-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13
-; GISEL-GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15
-; GISEL-GFX11-NEXT: s_add_u32 s25, s32, 44
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v26, s24
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v27, s25
-; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 48
-; GISEL-GFX11-NEXT: s_add_u32 s25, s32, 52
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v28, s24
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v29, s25
-; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 56
-; GISEL-GFX11-NEXT: s_add_u32 s25, s32, 60
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v30, s24
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v31, s25
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v26, s26
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v27, s27
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v28, s28
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v29, s29
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v30, s30
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v31, s31
; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GISEL-GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GISEL-GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
@@ -231,40 +233,42 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 0
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v32, v15 :: v_dual_mov_b32 v33, v14
+; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v34, v13 :: v_dual_mov_b32 v35, v12
+; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v36, v11 :: v_dual_mov_b32 v37, v10
+; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v38, v9 :: v_dual_mov_b32 v39, v8
; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 60
+; DAGISEL-GFX11-NEXT: s_add_i32 s25, s32, 56
+; DAGISEL-GFX11-NEXT: s_add_i32 s26, s32, 52
+; DAGISEL-GFX11-NEXT: s_add_i32 s27, s32, 48
+; DAGISEL-GFX11-NEXT: s_add_i32 s28, s32, 44
+; DAGISEL-GFX11-NEXT: s_add_i32 s29, s32, 40
+; DAGISEL-GFX11-NEXT: s_add_i32 s30, s32, 36
+; DAGISEL-GFX11-NEXT: s_clause 0x7
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v31, s24
-; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 56
-; DAGISEL-GFX11-NEXT: s_add_i32 s25, s32, 52
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v30, s24
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v29, s25
-; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 48
-; DAGISEL-GFX11-NEXT: s_add_i32 s25, s32, 44
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v28, s24
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v27, s25
-; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 40
-; DAGISEL-GFX11-NEXT: s_add_i32 s25, s32, 36
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v26, s24
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v25, s25
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v30, s25
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v29, s26
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v28, s27
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v27, s28
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v26, s29
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v25, s30
; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 32
; DAGISEL-GFX11-NEXT: s_add_i32 s25, s32, 28
+; DAGISEL-GFX11-NEXT: s_add_i32 s26, s32, 24
+; DAGISEL-GFX11-NEXT: s_add_i32 s27, s32, 20
+; DAGISEL-GFX11-NEXT: s_add_i32 s28, s32, 16
+; DAGISEL-GFX11-NEXT: s_add_i32 s29, s32, 12
+; DAGISEL-GFX11-NEXT: s_add_i32 s30, s32, 8
+; DAGISEL-GFX11-NEXT: s_add_i32 s31, s32, 4
+; DAGISEL-GFX11-NEXT: s_clause 0x7
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v24, s24
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v23, s25
-; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 24
-; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v34, v13 :: v_dual_mov_b32 v35, v12
-; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v36, v11 :: v_dual_mov_b32 v37, v10
-; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v38, v9 :: v_dual_mov_b32 v39, v8
-; DAGISEL-GFX11-NEXT: s_add_i32 s25, s32, 20
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v22, s24
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v21, s25
-; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 16
-; DAGISEL-GFX11-NEXT: s_add_i32 s25, s32, 12
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v20, s24
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v19, s25
-; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 8
-; DAGISEL-GFX11-NEXT: s_add_i32 s25, s32, 4
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v18, s24
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v17, s25
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v22, s26
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v21, s27
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v20, s28
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v19, s29
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v18, s30
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v17, s31
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index e1bbc243344b0..ec2d3f5b1fd6b 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -15,9 +15,8 @@ declare void @llvm.amdgcn.s.barrier() #2
; SI-ALLOCA: buffer_load_dword [[LOAD_A:v[0-9]+]]
; SI-ALLOCA: buffer_load_dword [[LOAD_B:v[0-9]+]]
-; SI-ALLOCA: v_lshlrev_b32_e32 [[SIZE_SCALE:v[0-9]+]], 2, [[LOAD_A]]
+; SI-ALLOCA: v_lshlrev_b32_e32 [[PTRREG:v[0-9]+]], 2, [[LOAD_B]]
-; SI-ALLOCA: v_mov_b32_e32 [[PTRREG:v[0-9]+]], [[SIZE_SCALE]]
; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
; SI-ALLOCA: s_barrier
; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
index 51caa84450ff3..f546fb46acc00 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
@@ -371,21 +371,22 @@ define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s0, s0, s15
; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_add_u32 s16, s8, 8
+; GFX8-NEXT: s_addc_u32 s17, s9, 0
+; GFX8-NEXT: s_getpc_b64 s[18:19]
+; GFX8-NEXT: s_add_u32 s18, s18, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX8-NEXT: s_addc_u32 s19, s19, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dword s15, s[8:9], 0x0
-; GFX8-NEXT: s_add_u32 s8, s8, 8
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
-; GFX8-NEXT: s_getpc_b64 s[16:17]
-; GFX8-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX8-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX8-NEXT: s_mov_b64 s[8:9], s[16:17]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s15
; GFX8-NEXT: s_mov_b32 s32, 0
-; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX8-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX8-NEXT: s_endpgm
;
; GFX8-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel:
@@ -394,16 +395,16 @@ define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr
; GFX8-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8
; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s13, s9
; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0
+; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s14, s10
+; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[6:7]
+; GFX8-ARCH-FLAT-NEXT: s_add_u32 s6, s6, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s7, s7, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
; GFX8-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0
-; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5]
-; GFX8-ARCH-FLAT-NEXT: s_add_u32 s4, s4, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s14, s10
-; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v31, v0, v2
; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3]
@@ -417,20 +418,21 @@ define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s15
; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_add_u32 s16, s8, 8
+; GFX9-NEXT: s_addc_u32 s17, s9, 0
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 s8, s8, 8
-; GFX9-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b64 s[8:9], s[16:17]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s15
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
; GFX9-NEXT: s_endpgm
;
; GFX9-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel:
@@ -439,15 +441,15 @@ define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr
; GFX9-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8
; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s13, s9
; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0
+; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s14, s10
+; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[6:7]
+; GFX9-ARCH-FLAT-NEXT: s_add_u32 s6, s6, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s7, s7, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
; GFX9-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0
-; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5]
-; GFX9-ARCH-FLAT-NEXT: s_add_u32 s4, s4, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s14, s10
-; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-ARCH-FLAT-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3]
@@ -463,13 +465,13 @@ define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr
; GFX942-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8
; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s13, s9
; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0
-; GFX942-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0
-; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5]
-; GFX942-ARCH-FLAT-NEXT: s_add_u32 s4, s4, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s14, s10
; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[6:7]
+; GFX942-ARCH-FLAT-NEXT: s_add_u32 s6, s6, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s7, s7, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX942-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0
+; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v31, v0
@@ -483,20 +485,22 @@ define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s15
; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0
-; GFX10-NEXT: s_add_u32 s8, s8, 8
-; GFX10-NEXT: s_addc_u32 s9, s9, 0
-; GFX10-NEXT: s_getpc_b64 s[16:17]
-; GFX10-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX10-NEXT: s_add_u32 s16, s8, 8
+; GFX10-NEXT: s_addc_u32 s17, s9, 0
+; GFX10-NEXT: s_getpc_b64 s[18:19]
+; GFX10-NEXT: s_add_u32 s18, s18, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s19, s19, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT: s_mov_b64 s[8:9], s[16:17]
; GFX10-NEXT: s_mov_b32 s32, 0
; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s15
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[20:21]
; GFX10-NEXT: s_endpgm
call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
ret void
@@ -736,21 +740,22 @@ define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s0, s0, s15
; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_add_u32 s16, s8, 8
+; GFX8-NEXT: s_addc_u32 s17, s9, 0
+; GFX8-NEXT: s_getpc_b64 s[18:19]
+; GFX8-NEXT: s_add_u32 s18, s18, calls_intrin_ascast@gotpcrel32@lo+4
+; GFX8-NEXT: s_addc_u32 s19, s19, calls_intrin_ascast@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dword s15, s[8:9], 0x0
-; GFX8-NEXT: s_add_u32 s8, s8, 8
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
-; GFX8-NEXT: s_getpc_b64 s[16:17]
-; GFX8-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4
-; GFX8-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12
-; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX8-NEXT: s_mov_b64 s[8:9], s[16:17]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s15
; GFX8-NEXT: s_mov_b32 s32, 0
-; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX8-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX8-NEXT: s_endpgm
;
; GFX8-ARCH-FLAT-LABEL: call_calls_intrin_ascast_cc_kernel:
@@ -759,16 +764,16 @@ define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %
; GFX8-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8
; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s13, s9
; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0
+; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s14, s10
+; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[6:7]
+; GFX8-ARCH-FLAT-NEXT: s_add_u32 s6, s6, calls_intrin_ascast@gotpcrel32@lo+4
+; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s7, s7, calls_intrin_ascast@gotpcrel32@hi+12
; GFX8-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0
-; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5]
-; GFX8-ARCH-FLAT-NEXT: s_add_u32 s4, s4, calls_intrin_ascast@gotpcrel32@lo+4
-; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, calls_intrin_ascast@gotpcrel32@hi+12
-; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s14, s10
-; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v31, v0, v2
; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3]
@@ -782,20 +787,21 @@ define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s15
; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_add_u32 s16, s8, 8
+; GFX9-NEXT: s_addc_u32 s17, s9, 0
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, calls_intrin_ascast@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, calls_intrin_ascast@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 s8, s8, 8
-; GFX9-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b64 s[8:9], s[16:17]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s15
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
; GFX9-NEXT: s_endpgm
;
; GFX9-ARCH-FLAT-LABEL: call_calls_intrin_ascast_cc_kernel:
@@ -804,15 +810,15 @@ define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %
; GFX9-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8
; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s13, s9
; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0
+; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s14, s10
+; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[6:7]
+; GFX9-ARCH-FLAT-NEXT: s_add_u32 s6, s6, calls_intrin_ascast@gotpcrel32@lo+4
+; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s7, s7, calls_intrin_ascast@gotpcrel32@hi+12
; GFX9-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0
-; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5]
-; GFX9-ARCH-FLAT-NEXT: s_add_u32 s4, s4, calls_intrin_ascast@gotpcrel32@lo+4
-; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, calls_intrin_ascast@gotpcrel32@hi+12
-; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s14, s10
-; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-ARCH-FLAT-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3]
@@ -828,13 +834,13 @@ define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %
; GFX942-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8
; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s13, s9
; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0
-; GFX942-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0
-; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5]
-; GFX942-ARCH-FLAT-NEXT: s_add_u32 s4, s4, calls_intrin_ascast@gotpcrel32@lo+4
-; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, calls_intrin_ascast@gotpcrel32@hi+12
-; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s14, s10
; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[6:7]
+; GFX942-ARCH-FLAT-NEXT: s_add_u32 s6, s6, calls_intrin_ascast@gotpcrel32@lo+4
+; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s7, s7, calls_intrin_ascast@gotpcrel32@hi+12
+; GFX942-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0
+; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v31, v0
@@ -848,20 +854,22 @@ define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s15
; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0
-; GFX10-NEXT: s_add_u32 s8, s8, 8
-; GFX10-NEXT: s_addc_u32 s9, s9, 0
-; GFX10-NEXT: s_getpc_b64 s[16:17]
-; GFX10-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12
+; GFX10-NEXT: s_add_u32 s16, s8, 8
+; GFX10-NEXT: s_addc_u32 s17, s9, 0
+; GFX10-NEXT: s_getpc_b64 s[18:19]
+; GFX10-NEXT: s_add_u32 s18, s18, calls_intrin_ascast@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s19, s19, calls_intrin_ascast@gotpcrel32@hi+12
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT: s_mov_b64 s[8:9], s[16:17]
; GFX10-NEXT: s_mov_b32 s32, 0
; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s15
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[20:21]
; GFX10-NEXT: s_endpgm
call void @calls_intrin_ascast(ptr addrspace(3) %ptr)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index c4957fd44e2be..338ea4a133e48 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -2140,6 +2140,7 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, 0x3f80
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, 0x4228
+; GFX11TRUE16-NEXT: s_clause 0x1
; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v4, off
; GFX11TRUE16-NEXT: global_store_d16_hi_b16 v[2:3], v4, off
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -2149,6 +2150,7 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, 0x3f80
; GFX11FAKE16-NEXT: v_mov_b32_e32 v5, 0x4228
+; GFX11FAKE16-NEXT: s_clause 0x1
; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v4, off
; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v5, off
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
@@ -43111,32 +43113,32 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: s_clause 0x1f
; GFX11TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32
-; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:68
-; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:72
-; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:124
-; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:128
-; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:64
-; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:60
+; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:124
+; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:128
+; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:60
+; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:64
+; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:112
+; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:116
; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:120
; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:56
-; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:116
+; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:48
; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:52
-; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:112
-; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:48
+; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:100
+; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:104
; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:108
; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:44
-; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:104
+; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:36
; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:40
-; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:100
-; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:36
-; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:96
-; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:32
+; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:96
+; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:32
+; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:84
+; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:88
; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:92
; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:28
-; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:88
+; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:20
; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:24
-; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:84
-; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:20
+; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:68
+; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:72
; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:76
; GFX11TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:80
; GFX11TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:16
@@ -43207,55 +43209,52 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v26
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v31
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v35.l, v36.l, s26
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v34.l, v37.l, s27
-; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v34.h, v37.h, s28
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(29)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v32.l, v34.l, s27
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v35.l, s26
+; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v32.h, v34.h, s28
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24)
; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v38.l, v39.l, s29
; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v38.h, v39.h, s25
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v48.l, v49.l, s24
-; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v48.h, v49.h, s23
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v50.l, v51.l, s22
-; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v50.h, v51.h, s21
+; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v37.l, v49.l, s24
+; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v37.h, v49.h, s23
+; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v36.l, v48.l, s22
+; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v36.h, v48.h, s21
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18)
; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v52.l, v53.l, s20
; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v52.h, v53.h, s19
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v54.l, v55.l, s18
-; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v54.h, v55.h, s17
+; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v51.l, v55.l, s18
+; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v51.h, v55.h, s17
+; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v50.l, v54.l, s16
+; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v50.h, v54.h, s15
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v64.l, v65.l, s16
-; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v64.h, v65.h, s15
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v66.l, v67.l, s14
-; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v66.h, v67.h, s13
+; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v64.l, v65.l, s14
+; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v64.h, v65.h, s13
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(10)
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v68.l, v69.l, s12
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v68.h, v69.h, s11
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v70.l, v71.l, s10
-; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v70.h, v71.h, s9
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v80.l, v81.l, s8
-; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v80.h, v81.h, s7
+; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v67.l, v71.l, s10
+; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v67.h, v71.h, s9
+; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v66.l, v70.l, s8
+; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v66.h, v70.h, s7
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v83.l, v84.l, s6
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v82.l, v85.l, s4
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v33.l, v86.l, s2
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v81.l, v86.l, s2
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v32.l, v87.l, s0
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v80.l, v87.l, s0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v16
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v32.h, v87.h, vcc_lo
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v33.h, v86.h, s1
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v80.h, v87.h, vcc_lo
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v81.h, v86.h, s1
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v82.h, v85.h, s3
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v83.h, v84.h, s5
-; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v35.h, v36.h, s0
+; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v33.h, v35.h, s0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v32bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 178b138b57141..b332c411cc715 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -5577,12 +5577,13 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s22, s32, 8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s21 :: v_dual_mov_b32 v1, s20
+; GFX11-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21
; GFX11-NEXT: v_mov_b32_e32 v2, s19
; GFX11-NEXT: s_add_i32 s19, s32, 4
; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v7, s43
-; GFX11-NEXT: scratch_store_b32 off, v0, s22
-; GFX11-NEXT: scratch_store_b32 off, v1, s19
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_store_b32 off, v0, s19
+; GFX11-NEXT: scratch_store_b32 off, v1, s22
; GFX11-NEXT: scratch_store_b32 off, v2, s32
; GFX11-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s39
; GFX11-NEXT: v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v2, s38
@@ -6062,6 +6063,7 @@ define void @stack_12xv3i32() #0 {
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s0, s32, 16
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: scratch_store_b32 off, v4, s0
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0
@@ -6403,6 +6405,7 @@ define void @stack_12xv3f32() #0 {
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s0, s32, 16
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: scratch_store_b32 off, v4, s0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
@@ -6773,6 +6776,7 @@ define void @stack_8xv5i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b32 off, v8, s0
; GFX11-NEXT: scratch_store_b128 off, v[4:7], s1
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, 0
@@ -7147,6 +7151,7 @@ define void @stack_8xv5f32() #0 {
; GFX11-NEXT: s_add_i32 s0, s32, 32
; GFX11-NEXT: s_add_i32 s1, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: scratch_store_b32 off, v8, s0
; GFX11-NEXT: scratch_store_b128 off, v[4:7], s1
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 9f48c8b5fe49c..c5b34bd805318 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -109,6 +109,7 @@ define <2 x half> @chain_hi_to_lo_private_different_bases(ptr addrspace(5) %base
; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_different_bases:
; FLATSCR_GFX10: ; %bb.0: ; %bb
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR_GFX10-NEXT: s_clause 0x1
; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, v0, off
; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, v1, off
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -125,6 +126,7 @@ define <2 x half> @chain_hi_to_lo_private_different_bases(ptr addrspace(5) %base
; GFX11-FAKE16-LABEL: chain_hi_to_lo_private_different_bases:
; GFX11-FAKE16: ; %bb.0: ; %bb
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: scratch_load_u16 v0, v0, off
; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v0, v1, off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -372,6 +374,7 @@ define <2 x half> @chain_hi_to_lo_global_different_bases(ptr addrspace(1) %base_
; GFX10-LABEL: chain_hi_to_lo_global_different_bases:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -388,6 +391,7 @@ define <2 x half> @chain_hi_to_lo_global_different_bases(ptr addrspace(1) %base_
; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_different_bases:
; GFX11-FAKE16: ; %bb.0: ; %bb
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index f7c58ca9599b4..1984c0205633c 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -139,6 +139,7 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: v_max_f32_e64 v2, v1, v1 clamp
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 3e0837b58aafc..702d26e4c14ad 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -506,6 +506,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
; GFX11-NEXT: v_max_f32_e32 v1, 0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_min_f32_e32 v2, 1.0, v1
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index 986dd8a046424..7ac0b81a67d81 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -366,6 +366,7 @@ define amdgpu_ps void @no_cluster_image_load(<8 x i32> inreg %src1, <8 x i32> in
; GFX11-LABEL: no_cluster_image_load:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_load_mip v[2:5], [v0, v1, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: image_load_mip v[6:9], [v0, v1, v6], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
index 52ccfe8ba3bfb..497380ff0ae6e 100644
--- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
@@ -1,17 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICIVI,SICI,SI %s
; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICIVI,SICI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,SICIVI,VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; GCN-LABEL: {{^}}load_i32:
-; GCN-DAG: s_mov_b32 s3, 0
-; GCN-DAG: s_mov_b32 s2, s1
-; GCN-DAG: s_mov_b32 s1, s3
-; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
-; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2
-; GFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
-; GFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8
define amdgpu_vs float @load_i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+; SICI-LABEL: load_i32:
+; SICI: ; %bb.0:
+; SICI-NEXT: s_mov_b32 s3, 0
+; SICI-NEXT: s_mov_b32 s2, s1
+; SICI-NEXT: s_mov_b32 s1, s3
+; SICI-NEXT: s_load_dword s0, s[0:1], 0x0
+; SICI-NEXT: s_load_dword s1, s[2:3], 0x2
+; SICI-NEXT: s_waitcnt lgkmcnt(0)
+; SICI-NEXT: s_add_i32 s0, s0, s1
+; SICI-NEXT: v_mov_b32_e32 v0, s0
+; SICI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: load_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: s_mov_b32 s2, s1
+; VI-NEXT: s_mov_b32 s1, s3
+; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x8
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s0, s0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s3, 0
+; GFX9-NEXT: s_mov_b32 s2, s1
+; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s5, s[2:3], 0x8
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s4, s4, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: ; return to shader part epilog
%gep1 = getelementptr inbounds i32, ptr addrspace(6) %p1, i32 2
%r0 = load i32, ptr addrspace(6) %p0
%r1 = load i32, ptr addrspace(6) %gep1
@@ -20,20 +48,48 @@ define amdgpu_vs float @load_i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) in
ret float %r2
}
-; GCN-LABEL: {{^}}load_v2i32:
-; SICIVI-DAG: s_mov_b32 s3, 0
-; SICIVI-DAG: s_mov_b32 s2, s1
-; SICIVI-DAG: s_mov_b32 s1, s3
-; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
-; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4
-; VI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
-; VI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
-; GFX9-DAG: s_mov_b32 s2, s1
-; GFX9-DAG: s_mov_b32 s3, 0
-; GFX9-DAG: s_mov_b32 s1, s3
-; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
-; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
define amdgpu_vs <2 x float> @load_v2i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+; SICI-LABEL: load_v2i32:
+; SICI: ; %bb.0:
+; SICI-NEXT: s_mov_b32 s3, 0
+; SICI-NEXT: s_mov_b32 s2, s1
+; SICI-NEXT: s_mov_b32 s1, s3
+; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SICI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4
+; SICI-NEXT: s_waitcnt lgkmcnt(0)
+; SICI-NEXT: s_add_i32 s0, s0, s2
+; SICI-NEXT: s_add_i32 s1, s1, s3
+; SICI-NEXT: v_mov_b32_e32 v0, s0
+; SICI-NEXT: v_mov_b32_e32 v1, s1
+; SICI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: load_v2i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: s_mov_b32 s2, s1
+; VI-NEXT: s_mov_b32 s1, s3
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x10
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s0, s0, s2
+; VI-NEXT: s_add_i32 s1, s1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s3, 0
+; GFX9-NEXT: s_mov_b32 s2, s1
+; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x10
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s0, s4, s6
+; GFX9-NEXT: s_add_i32 s1, s5, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: ; return to shader part epilog
%gep1 = getelementptr inbounds <2 x i32>, ptr addrspace(6) %p1, i32 2
%r0 = load <2 x i32>, ptr addrspace(6) %p0
%r1 = load <2 x i32>, ptr addrspace(6) %gep1
@@ -42,17 +98,60 @@ define amdgpu_vs <2 x float> @load_v2i32(ptr addrspace(6) inreg %p0, ptr addrspa
ret <2 x float> %r2
}
-; GCN-LABEL: {{^}}load_v4i32:
-; GCN-DAG: s_mov_b32 s3, 0
-; GCN-DAG: s_mov_b32 s2, s1
-; GCN-DAG: s_mov_b32 s1, s3
-; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
-; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8
-; VI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
-; VI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
-; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
-; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
define amdgpu_vs <4 x float> @load_v4i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+; SICI-LABEL: load_v4i32:
+; SICI: ; %bb.0:
+; SICI-NEXT: s_mov_b32 s5, 0
+; SICI-NEXT: s_mov_b32 s4, s1
+; SICI-NEXT: s_mov_b32 s1, s5
+; SICI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SICI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8
+; SICI-NEXT: s_waitcnt lgkmcnt(0)
+; SICI-NEXT: s_add_i32 s0, s0, s4
+; SICI-NEXT: s_add_i32 s1, s1, s5
+; SICI-NEXT: s_add_i32 s2, s2, s6
+; SICI-NEXT: s_add_i32 s3, s3, s7
+; SICI-NEXT: v_mov_b32_e32 v0, s0
+; SICI-NEXT: v_mov_b32_e32 v1, s1
+; SICI-NEXT: v_mov_b32_e32 v2, s2
+; SICI-NEXT: v_mov_b32_e32 v3, s3
+; SICI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: load_v4i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s5, 0
+; VI-NEXT: s_mov_b32 s4, s1
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x20
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s0, s0, s4
+; VI-NEXT: s_add_i32 s1, s1, s5
+; VI-NEXT: s_add_i32 s2, s2, s6
+; VI-NEXT: s_add_i32 s3, s3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s3, 0
+; GFX9-NEXT: s_mov_b32 s2, s1
+; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x20
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s0, s4, s8
+; GFX9-NEXT: s_add_i32 s1, s5, s9
+; GFX9-NEXT: s_add_i32 s2, s6, s10
+; GFX9-NEXT: s_add_i32 s3, s7, s11
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: ; return to shader part epilog
%gep1 = getelementptr inbounds <4 x i32>, ptr addrspace(6) %p1, i32 2
%r0 = load <4 x i32>, ptr addrspace(6) %p0
%r1 = load <4 x i32>, ptr addrspace(6) %gep1
@@ -61,17 +160,84 @@ define amdgpu_vs <4 x float> @load_v4i32(ptr addrspace(6) inreg %p0, ptr addrspa
ret <4 x float> %r2
}
-; GCN-LABEL: {{^}}load_v8i32:
-; GCN-DAG: s_mov_b32 s3, 0
-; GCN-DAG: s_mov_b32 s2, s1
-; GCN-DAG: s_mov_b32 s1, s3
-; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
-; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10
-; VI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
-; VI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
-; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
-; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
define amdgpu_vs <8 x float> @load_v8i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+; SICI-LABEL: load_v8i32:
+; SICI: ; %bb.0:
+; SICI-NEXT: s_mov_b32 s2, s1
+; SICI-NEXT: s_mov_b32 s3, 0
+; SICI-NEXT: s_mov_b32 s1, s3
+; SICI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x10
+; SICI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x0
+; SICI-NEXT: s_waitcnt lgkmcnt(0)
+; SICI-NEXT: s_add_i32 s0, s12, s4
+; SICI-NEXT: s_add_i32 s1, s13, s5
+; SICI-NEXT: s_add_i32 s2, s14, s6
+; SICI-NEXT: s_add_i32 s3, s15, s7
+; SICI-NEXT: s_add_i32 s4, s16, s8
+; SICI-NEXT: s_add_i32 s5, s17, s9
+; SICI-NEXT: s_add_i32 s6, s18, s10
+; SICI-NEXT: s_add_i32 s7, s19, s11
+; SICI-NEXT: v_mov_b32_e32 v0, s0
+; SICI-NEXT: v_mov_b32_e32 v1, s1
+; SICI-NEXT: v_mov_b32_e32 v2, s2
+; SICI-NEXT: v_mov_b32_e32 v3, s3
+; SICI-NEXT: v_mov_b32_e32 v4, s4
+; SICI-NEXT: v_mov_b32_e32 v5, s5
+; SICI-NEXT: v_mov_b32_e32 v6, s6
+; SICI-NEXT: v_mov_b32_e32 v7, s7
+; SICI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: load_v8i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s2, s1
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: s_mov_b32 s1, s3
+; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x40
+; VI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s0, s12, s4
+; VI-NEXT: s_add_i32 s1, s13, s5
+; VI-NEXT: s_add_i32 s2, s14, s6
+; VI-NEXT: s_add_i32 s3, s15, s7
+; VI-NEXT: s_add_i32 s4, s16, s8
+; VI-NEXT: s_add_i32 s5, s17, s9
+; VI-NEXT: s_add_i32 s6, s18, s10
+; VI-NEXT: s_add_i32 s7, s19, s11
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_v8i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s2, s1
+; GFX9-NEXT: s_mov_b32 s3, 0
+; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x40
+; GFX9-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s0, s12, s4
+; GFX9-NEXT: s_add_i32 s1, s13, s5
+; GFX9-NEXT: s_add_i32 s2, s14, s6
+; GFX9-NEXT: s_add_i32 s3, s15, s7
+; GFX9-NEXT: s_add_i32 s4, s16, s8
+; GFX9-NEXT: s_add_i32 s5, s17, s9
+; GFX9-NEXT: s_add_i32 s6, s18, s10
+; GFX9-NEXT: s_add_i32 s7, s19, s11
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-NEXT: v_mov_b32_e32 v7, s7
+; GFX9-NEXT: ; return to shader part epilog
%gep1 = getelementptr inbounds <8 x i32>, ptr addrspace(6) %p1, i32 2
%r0 = load <8 x i32>, ptr addrspace(6) %p0
%r1 = load <8 x i32>, ptr addrspace(6) %gep1
@@ -80,17 +246,132 @@ define amdgpu_vs <8 x float> @load_v8i32(ptr addrspace(6) inreg %p0, ptr addrspa
ret <8 x float> %r2
}
-; GCN-LABEL: {{^}}load_v16i32:
-; GCN-DAG: s_mov_b32 s3, 0
-; GCN-DAG: s_mov_b32 s2, s1
-; GCN-DAG: s_mov_b32 s1, s3
-; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
-; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20
-; VI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
-; VI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
-; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
-; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
define amdgpu_vs <16 x float> @load_v16i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+; SICI-LABEL: load_v16i32:
+; SICI: ; %bb.0:
+; SICI-NEXT: s_mov_b32 s2, s1
+; SICI-NEXT: s_mov_b32 s3, 0
+; SICI-NEXT: s_mov_b32 s1, s3
+; SICI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x20
+; SICI-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0
+; SICI-NEXT: s_waitcnt lgkmcnt(0)
+; SICI-NEXT: s_add_i32 s0, s36, s4
+; SICI-NEXT: s_add_i32 s1, s37, s5
+; SICI-NEXT: s_add_i32 s2, s38, s6
+; SICI-NEXT: s_add_i32 s3, s39, s7
+; SICI-NEXT: s_add_i32 s4, s40, s8
+; SICI-NEXT: s_add_i32 s5, s41, s9
+; SICI-NEXT: s_add_i32 s6, s42, s10
+; SICI-NEXT: s_add_i32 s7, s43, s11
+; SICI-NEXT: s_add_i32 s8, s44, s12
+; SICI-NEXT: s_add_i32 s9, s45, s13
+; SICI-NEXT: s_add_i32 s10, s46, s14
+; SICI-NEXT: s_add_i32 s11, s47, s15
+; SICI-NEXT: s_add_i32 s12, s48, s16
+; SICI-NEXT: s_add_i32 s13, s49, s17
+; SICI-NEXT: s_add_i32 s14, s50, s18
+; SICI-NEXT: s_add_i32 s15, s51, s19
+; SICI-NEXT: v_mov_b32_e32 v0, s0
+; SICI-NEXT: v_mov_b32_e32 v1, s1
+; SICI-NEXT: v_mov_b32_e32 v2, s2
+; SICI-NEXT: v_mov_b32_e32 v3, s3
+; SICI-NEXT: v_mov_b32_e32 v4, s4
+; SICI-NEXT: v_mov_b32_e32 v5, s5
+; SICI-NEXT: v_mov_b32_e32 v6, s6
+; SICI-NEXT: v_mov_b32_e32 v7, s7
+; SICI-NEXT: v_mov_b32_e32 v8, s8
+; SICI-NEXT: v_mov_b32_e32 v9, s9
+; SICI-NEXT: v_mov_b32_e32 v10, s10
+; SICI-NEXT: v_mov_b32_e32 v11, s11
+; SICI-NEXT: v_mov_b32_e32 v12, s12
+; SICI-NEXT: v_mov_b32_e32 v13, s13
+; SICI-NEXT: v_mov_b32_e32 v14, s14
+; SICI-NEXT: v_mov_b32_e32 v15, s15
+; SICI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: load_v16i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s2, s1
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: s_mov_b32 s1, s3
+; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x80
+; VI-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s0, s36, s4
+; VI-NEXT: s_add_i32 s1, s37, s5
+; VI-NEXT: s_add_i32 s2, s38, s6
+; VI-NEXT: s_add_i32 s3, s39, s7
+; VI-NEXT: s_add_i32 s4, s40, s8
+; VI-NEXT: s_add_i32 s5, s41, s9
+; VI-NEXT: s_add_i32 s6, s42, s10
+; VI-NEXT: s_add_i32 s7, s43, s11
+; VI-NEXT: s_add_i32 s8, s44, s12
+; VI-NEXT: s_add_i32 s9, s45, s13
+; VI-NEXT: s_add_i32 s10, s46, s14
+; VI-NEXT: s_add_i32 s11, s47, s15
+; VI-NEXT: s_add_i32 s12, s48, s16
+; VI-NEXT: s_add_i32 s13, s49, s17
+; VI-NEXT: s_add_i32 s14, s50, s18
+; VI-NEXT: s_add_i32 s15, s51, s19
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
+; VI-NEXT: v_mov_b32_e32 v8, s8
+; VI-NEXT: v_mov_b32_e32 v9, s9
+; VI-NEXT: v_mov_b32_e32 v10, s10
+; VI-NEXT: v_mov_b32_e32 v11, s11
+; VI-NEXT: v_mov_b32_e32 v12, s12
+; VI-NEXT: v_mov_b32_e32 v13, s13
+; VI-NEXT: v_mov_b32_e32 v14, s14
+; VI-NEXT: v_mov_b32_e32 v15, s15
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_v16i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s2, s1
+; GFX9-NEXT: s_mov_b32 s3, 0
+; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x80
+; GFX9-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s0, s36, s4
+; GFX9-NEXT: s_add_i32 s1, s37, s5
+; GFX9-NEXT: s_add_i32 s2, s38, s6
+; GFX9-NEXT: s_add_i32 s3, s39, s7
+; GFX9-NEXT: s_add_i32 s4, s40, s8
+; GFX9-NEXT: s_add_i32 s5, s41, s9
+; GFX9-NEXT: s_add_i32 s6, s42, s10
+; GFX9-NEXT: s_add_i32 s7, s43, s11
+; GFX9-NEXT: s_add_i32 s8, s44, s12
+; GFX9-NEXT: s_add_i32 s9, s45, s13
+; GFX9-NEXT: s_add_i32 s10, s46, s14
+; GFX9-NEXT: s_add_i32 s11, s47, s15
+; GFX9-NEXT: s_add_i32 s12, s48, s16
+; GFX9-NEXT: s_add_i32 s13, s49, s17
+; GFX9-NEXT: s_add_i32 s14, s50, s18
+; GFX9-NEXT: s_add_i32 s15, s51, s19
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-NEXT: v_mov_b32_e32 v7, s7
+; GFX9-NEXT: v_mov_b32_e32 v8, s8
+; GFX9-NEXT: v_mov_b32_e32 v9, s9
+; GFX9-NEXT: v_mov_b32_e32 v10, s10
+; GFX9-NEXT: v_mov_b32_e32 v11, s11
+; GFX9-NEXT: v_mov_b32_e32 v12, s12
+; GFX9-NEXT: v_mov_b32_e32 v13, s13
+; GFX9-NEXT: v_mov_b32_e32 v14, s14
+; GFX9-NEXT: v_mov_b32_e32 v15, s15
+; GFX9-NEXT: ; return to shader part epilog
%gep1 = getelementptr inbounds <16 x i32>, ptr addrspace(6) %p1, i32 2
%r0 = load <16 x i32>, ptr addrspace(6) %p0
%r1 = load <16 x i32>, ptr addrspace(6) %gep1
@@ -99,17 +380,42 @@ define amdgpu_vs <16 x float> @load_v16i32(ptr addrspace(6) inreg %p0, ptr addrs
ret <16 x float> %r2
}
-; GCN-LABEL: {{^}}load_float:
-; GCN-DAG: s_mov_b32 s3, 0
-; GCN-DAG: s_mov_b32 s2, s1
-; GCN-DAG: s_mov_b32 s1, s3
-; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
-; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2
-; VI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
-; VI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8
-; GFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
-; GFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8
define amdgpu_vs float @load_float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+; SICI-LABEL: load_float:
+; SICI: ; %bb.0:
+; SICI-NEXT: s_mov_b32 s2, s1
+; SICI-NEXT: s_mov_b32 s3, 0
+; SICI-NEXT: s_mov_b32 s1, s3
+; SICI-NEXT: s_load_dword s2, s[2:3], 0x2
+; SICI-NEXT: s_load_dword s0, s[0:1], 0x0
+; SICI-NEXT: s_waitcnt lgkmcnt(0)
+; SICI-NEXT: v_mov_b32_e32 v0, s2
+; SICI-NEXT: v_add_f32_e32 v0, s0, v0
+; SICI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: load_float:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s2, s1
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: s_mov_b32 s1, s3
+; VI-NEXT: s_load_dword s2, s[2:3], 0x8
+; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_add_f32_e32 v0, s0, v0
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_float:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s2, s1
+; GFX9-NEXT: s_mov_b32 s3, 0
+; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8
+; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_add_f32_e32 v0, s5, v0
+; GFX9-NEXT: ; return to shader part epilog
%gep1 = getelementptr inbounds float, ptr addrspace(6) %p1, i32 2
%r0 = load float, ptr addrspace(6) %p0
%r1 = load float, ptr addrspace(6) %gep1
@@ -117,20 +423,48 @@ define amdgpu_vs float @load_float(ptr addrspace(6) inreg %p0, ptr addrspace(6)
ret float %r
}
-; GCN-LABEL: {{^}}load_v2float:
-; SICIVI-DAG: s_mov_b32 s3, 0
-; SICIVI-DAG: s_mov_b32 s2, s1
-; SICIVI-DAG: s_mov_b32 s1, s3
-; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
-; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4
-; VI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
-; VI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
-; GFX9-DAG: s_mov_b32 s2, s1
-; GFX9-DAG: s_mov_b32 s3, 0
-; GFX9-DAG: s_mov_b32 s1, s3
-; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
-; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
define amdgpu_vs <2 x float> @load_v2float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+; SICI-LABEL: load_v2float:
+; SICI: ; %bb.0:
+; SICI-NEXT: s_mov_b32 s2, s1
+; SICI-NEXT: s_mov_b32 s3, 0
+; SICI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4
+; SICI-NEXT: s_mov_b32 s1, s3
+; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SICI-NEXT: s_waitcnt lgkmcnt(0)
+; SICI-NEXT: v_mov_b32_e32 v0, s4
+; SICI-NEXT: v_mov_b32_e32 v1, s5
+; SICI-NEXT: v_add_f32_e32 v0, s0, v0
+; SICI-NEXT: v_add_f32_e32 v1, s1, v1
+; SICI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: load_v2float:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s2, s1
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x10
+; VI-NEXT: s_mov_b32 s1, s3
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_f32_e32 v0, s0, v0
+; VI-NEXT: v_add_f32_e32 v1, s1, v1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_v2float:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s2, s1
+; GFX9-NEXT: s_mov_b32 s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x10
+; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX9-NEXT: v_add_f32_e32 v1, s1, v1
+; GFX9-NEXT: ; return to shader part epilog
%gep1 = getelementptr inbounds <2 x float>, ptr addrspace(6) %p1, i32 2
%r0 = load <2 x float>, ptr addrspace(6) %p0
%r1 = load <2 x float>, ptr addrspace(6) %gep1
@@ -138,17 +472,60 @@ define amdgpu_vs <2 x float> @load_v2float(ptr addrspace(6) inreg %p0, ptr addrs
ret <2 x float> %r
}
-; GCN-LABEL: {{^}}load_v4float:
-; GCN-DAG: s_mov_b32 s3, 0
-; GCN-DAG: s_mov_b32 s2, s1
-; GCN-DAG: s_mov_b32 s1, s3
-; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
-; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8
-; VI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
-; VI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
-; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
-; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
define amdgpu_vs <4 x float> @load_v4float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+; SICI-LABEL: load_v4float:
+; SICI: ; %bb.0:
+; SICI-NEXT: s_mov_b32 s2, s1
+; SICI-NEXT: s_mov_b32 s3, 0
+; SICI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8
+; SICI-NEXT: s_mov_b32 s1, s3
+; SICI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SICI-NEXT: s_waitcnt lgkmcnt(0)
+; SICI-NEXT: v_mov_b32_e32 v0, s4
+; SICI-NEXT: v_mov_b32_e32 v1, s5
+; SICI-NEXT: v_mov_b32_e32 v2, s6
+; SICI-NEXT: v_mov_b32_e32 v3, s7
+; SICI-NEXT: v_add_f32_e32 v0, s0, v0
+; SICI-NEXT: v_add_f32_e32 v1, s1, v1
+; SICI-NEXT: v_add_f32_e32 v2, s2, v2
+; SICI-NEXT: v_add_f32_e32 v3, s3, v3
+; SICI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: load_v4float:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s2, s1
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x20
+; VI-NEXT: s_mov_b32 s1, s3
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_add_f32_e32 v0, s0, v0
+; VI-NEXT: v_add_f32_e32 v1, s1, v1
+; VI-NEXT: v_add_f32_e32 v2, s2, v2
+; VI-NEXT: v_add_f32_e32 v3, s3, v3
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_v4float:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s2, s1
+; GFX9-NEXT: s_mov_b32 s3, 0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x20
+; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX9-NEXT: v_add_f32_e32 v1, s1, v1
+; GFX9-NEXT: v_add_f32_e32 v2, s2, v2
+; GFX9-NEXT: v_add_f32_e32 v3, s3, v3
+; GFX9-NEXT: ; return to shader part epilog
%gep1 = getelementptr inbounds <4 x float>, ptr addrspace(6) %p1, i32 2
%r0 = load <4 x float>, ptr addrspace(6) %p0
%r1 = load <4 x float>, ptr addrspace(6) %gep1
@@ -156,17 +533,84 @@ define amdgpu_vs <4 x float> @load_v4float(ptr addrspace(6) inreg %p0, ptr addrs
ret <4 x float> %r
}
-; GCN-LABEL: {{^}}load_v8float:
-; GCN-DAG: s_mov_b32 s3, 0
-; GCN-DAG: s_mov_b32 s2, s1
-; GCN-DAG: s_mov_b32 s1, s3
-; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
-; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10
-; VI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
-; VI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
-; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
-; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
define amdgpu_vs <8 x float> @load_v8float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+; SICI-LABEL: load_v8float:
+; SICI: ; %bb.0:
+; SICI-NEXT: s_mov_b32 s2, s1
+; SICI-NEXT: s_mov_b32 s3, 0
+; SICI-NEXT: s_mov_b32 s1, s3
+; SICI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x10
+; SICI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x0
+; SICI-NEXT: s_waitcnt lgkmcnt(0)
+; SICI-NEXT: v_mov_b32_e32 v0, s4
+; SICI-NEXT: v_mov_b32_e32 v1, s5
+; SICI-NEXT: v_mov_b32_e32 v2, s6
+; SICI-NEXT: v_mov_b32_e32 v3, s7
+; SICI-NEXT: v_mov_b32_e32 v4, s8
+; SICI-NEXT: v_mov_b32_e32 v5, s9
+; SICI-NEXT: v_mov_b32_e32 v6, s10
+; SICI-NEXT: v_mov_b32_e32 v7, s11
+; SICI-NEXT: v_add_f32_e32 v0, s12, v0
+; SICI-NEXT: v_add_f32_e32 v1, s13, v1
+; SICI-NEXT: v_add_f32_e32 v2, s14, v2
+; SICI-NEXT: v_add_f32_e32 v3, s15, v3
+; SICI-NEXT: v_add_f32_e32 v4, s16, v4
+; SICI-NEXT: v_add_f32_e32 v5, s17, v5
+; SICI-NEXT: v_add_f32_e32 v6, s18, v6
+; SICI-NEXT: v_add_f32_e32 v7, s19, v7
+; SICI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: load_v8float:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s2, s1
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: s_mov_b32 s1, s3
+; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x40
+; VI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: v_mov_b32_e32 v6, s10
+; VI-NEXT: v_mov_b32_e32 v7, s11
+; VI-NEXT: v_add_f32_e32 v0, s12, v0
+; VI-NEXT: v_add_f32_e32 v1, s13, v1
+; VI-NEXT: v_add_f32_e32 v2, s14, v2
+; VI-NEXT: v_add_f32_e32 v3, s15, v3
+; VI-NEXT: v_add_f32_e32 v4, s16, v4
+; VI-NEXT: v_add_f32_e32 v5, s17, v5
+; VI-NEXT: v_add_f32_e32 v6, s18, v6
+; VI-NEXT: v_add_f32_e32 v7, s19, v7
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_v8float:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s2, s1
+; GFX9-NEXT: s_mov_b32 s3, 0
+; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x40
+; GFX9-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-NEXT: v_mov_b32_e32 v5, s9
+; GFX9-NEXT: v_mov_b32_e32 v6, s10
+; GFX9-NEXT: v_mov_b32_e32 v7, s11
+; GFX9-NEXT: v_add_f32_e32 v0, s12, v0
+; GFX9-NEXT: v_add_f32_e32 v1, s13, v1
+; GFX9-NEXT: v_add_f32_e32 v2, s14, v2
+; GFX9-NEXT: v_add_f32_e32 v3, s15, v3
+; GFX9-NEXT: v_add_f32_e32 v4, s16, v4
+; GFX9-NEXT: v_add_f32_e32 v5, s17, v5
+; GFX9-NEXT: v_add_f32_e32 v6, s18, v6
+; GFX9-NEXT: v_add_f32_e32 v7, s19, v7
+; GFX9-NEXT: ; return to shader part epilog
%gep1 = getelementptr inbounds <8 x float>, ptr addrspace(6) %p1, i32 2
%r0 = load <8 x float>, ptr addrspace(6) %p0
%r1 = load <8 x float>, ptr addrspace(6) %gep1
@@ -174,17 +618,132 @@ define amdgpu_vs <8 x float> @load_v8float(ptr addrspace(6) inreg %p0, ptr addrs
ret <8 x float> %r
}
-; GCN-LABEL: {{^}}load_v16float:
-; GCN-DAG: s_mov_b32 s3, 0
-; GCN-DAG: s_mov_b32 s2, s1
-; GCN-DAG: s_mov_b32 s1, s3
-; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
-; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20
-; VI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
-; VI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
-; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
-; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
define amdgpu_vs <16 x float> @load_v16float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+; SICI-LABEL: load_v16float:
+; SICI: ; %bb.0:
+; SICI-NEXT: s_mov_b32 s3, 0
+; SICI-NEXT: s_mov_b32 s2, s1
+; SICI-NEXT: s_mov_b32 s1, s3
+; SICI-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x20
+; SICI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x0
+; SICI-NEXT: s_waitcnt lgkmcnt(0)
+; SICI-NEXT: v_mov_b32_e32 v0, s16
+; SICI-NEXT: v_mov_b32_e32 v1, s17
+; SICI-NEXT: v_mov_b32_e32 v2, s18
+; SICI-NEXT: v_mov_b32_e32 v3, s19
+; SICI-NEXT: v_mov_b32_e32 v4, s20
+; SICI-NEXT: v_mov_b32_e32 v5, s21
+; SICI-NEXT: v_mov_b32_e32 v6, s22
+; SICI-NEXT: v_mov_b32_e32 v7, s23
+; SICI-NEXT: v_mov_b32_e32 v8, s24
+; SICI-NEXT: v_mov_b32_e32 v9, s25
+; SICI-NEXT: v_mov_b32_e32 v10, s26
+; SICI-NEXT: v_mov_b32_e32 v11, s27
+; SICI-NEXT: v_mov_b32_e32 v12, s28
+; SICI-NEXT: v_mov_b32_e32 v13, s29
+; SICI-NEXT: v_mov_b32_e32 v14, s30
+; SICI-NEXT: v_mov_b32_e32 v15, s31
+; SICI-NEXT: v_add_f32_e32 v0, s0, v0
+; SICI-NEXT: v_add_f32_e32 v1, s1, v1
+; SICI-NEXT: v_add_f32_e32 v2, s2, v2
+; SICI-NEXT: v_add_f32_e32 v3, s3, v3
+; SICI-NEXT: v_add_f32_e32 v4, s4, v4
+; SICI-NEXT: v_add_f32_e32 v5, s5, v5
+; SICI-NEXT: v_add_f32_e32 v6, s6, v6
+; SICI-NEXT: v_add_f32_e32 v7, s7, v7
+; SICI-NEXT: v_add_f32_e32 v8, s8, v8
+; SICI-NEXT: v_add_f32_e32 v9, s9, v9
+; SICI-NEXT: v_add_f32_e32 v10, s10, v10
+; SICI-NEXT: v_add_f32_e32 v11, s11, v11
+; SICI-NEXT: v_add_f32_e32 v12, s12, v12
+; SICI-NEXT: v_add_f32_e32 v13, s13, v13
+; SICI-NEXT: v_add_f32_e32 v14, s14, v14
+; SICI-NEXT: v_add_f32_e32 v15, s15, v15
+; SICI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: load_v16float:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: s_mov_b32 s2, s1
+; VI-NEXT: s_mov_b32 s1, s3
+; VI-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x80
+; VI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v1, s17
+; VI-NEXT: v_mov_b32_e32 v2, s18
+; VI-NEXT: v_mov_b32_e32 v3, s19
+; VI-NEXT: v_mov_b32_e32 v4, s20
+; VI-NEXT: v_mov_b32_e32 v5, s21
+; VI-NEXT: v_mov_b32_e32 v6, s22
+; VI-NEXT: v_mov_b32_e32 v7, s23
+; VI-NEXT: v_mov_b32_e32 v8, s24
+; VI-NEXT: v_mov_b32_e32 v9, s25
+; VI-NEXT: v_mov_b32_e32 v10, s26
+; VI-NEXT: v_mov_b32_e32 v11, s27
+; VI-NEXT: v_mov_b32_e32 v12, s28
+; VI-NEXT: v_mov_b32_e32 v13, s29
+; VI-NEXT: v_mov_b32_e32 v14, s30
+; VI-NEXT: v_mov_b32_e32 v15, s31
+; VI-NEXT: v_add_f32_e32 v0, s0, v0
+; VI-NEXT: v_add_f32_e32 v1, s1, v1
+; VI-NEXT: v_add_f32_e32 v2, s2, v2
+; VI-NEXT: v_add_f32_e32 v3, s3, v3
+; VI-NEXT: v_add_f32_e32 v4, s4, v4
+; VI-NEXT: v_add_f32_e32 v5, s5, v5
+; VI-NEXT: v_add_f32_e32 v6, s6, v6
+; VI-NEXT: v_add_f32_e32 v7, s7, v7
+; VI-NEXT: v_add_f32_e32 v8, s8, v8
+; VI-NEXT: v_add_f32_e32 v9, s9, v9
+; VI-NEXT: v_add_f32_e32 v10, s10, v10
+; VI-NEXT: v_add_f32_e32 v11, s11, v11
+; VI-NEXT: v_add_f32_e32 v12, s12, v12
+; VI-NEXT: v_add_f32_e32 v13, s13, v13
+; VI-NEXT: v_add_f32_e32 v14, s14, v14
+; VI-NEXT: v_add_f32_e32 v15, s15, v15
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_v16float:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s2, s1
+; GFX9-NEXT: s_mov_b32 s3, 0
+; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x80
+; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s36
+; GFX9-NEXT: v_mov_b32_e32 v1, s37
+; GFX9-NEXT: v_mov_b32_e32 v2, s38
+; GFX9-NEXT: v_mov_b32_e32 v3, s39
+; GFX9-NEXT: v_mov_b32_e32 v4, s40
+; GFX9-NEXT: v_mov_b32_e32 v5, s41
+; GFX9-NEXT: v_mov_b32_e32 v6, s42
+; GFX9-NEXT: v_mov_b32_e32 v7, s43
+; GFX9-NEXT: v_mov_b32_e32 v8, s44
+; GFX9-NEXT: v_mov_b32_e32 v9, s45
+; GFX9-NEXT: v_mov_b32_e32 v10, s46
+; GFX9-NEXT: v_mov_b32_e32 v11, s47
+; GFX9-NEXT: v_mov_b32_e32 v12, s48
+; GFX9-NEXT: v_mov_b32_e32 v13, s49
+; GFX9-NEXT: v_mov_b32_e32 v14, s50
+; GFX9-NEXT: v_mov_b32_e32 v15, s51
+; GFX9-NEXT: v_add_f32_e32 v0, s4, v0
+; GFX9-NEXT: v_add_f32_e32 v1, s5, v1
+; GFX9-NEXT: v_add_f32_e32 v2, s6, v2
+; GFX9-NEXT: v_add_f32_e32 v3, s7, v3
+; GFX9-NEXT: v_add_f32_e32 v4, s8, v4
+; GFX9-NEXT: v_add_f32_e32 v5, s9, v5
+; GFX9-NEXT: v_add_f32_e32 v6, s10, v6
+; GFX9-NEXT: v_add_f32_e32 v7, s11, v7
+; GFX9-NEXT: v_add_f32_e32 v8, s12, v8
+; GFX9-NEXT: v_add_f32_e32 v9, s13, v9
+; GFX9-NEXT: v_add_f32_e32 v10, s14, v10
+; GFX9-NEXT: v_add_f32_e32 v11, s15, v11
+; GFX9-NEXT: v_add_f32_e32 v12, s16, v12
+; GFX9-NEXT: v_add_f32_e32 v13, s17, v13
+; GFX9-NEXT: v_add_f32_e32 v14, s18, v14
+; GFX9-NEXT: v_add_f32_e32 v15, s19, v15
+; GFX9-NEXT: ; return to shader part epilog
%gep1 = getelementptr inbounds <16 x float>, ptr addrspace(6) %p1, i32 2
%r0 = load <16 x float>, ptr addrspace(6) %p0
%r1 = load <16 x float>, ptr addrspace(6) %gep1
@@ -192,45 +751,107 @@ define amdgpu_vs <16 x float> @load_v16float(ptr addrspace(6) inreg %p0, ptr add
ret <16 x float> %r
}
-; GCN-LABEL: {{^}}load_i32_hi0:
-; GCN: s_mov_b32 s1, 0
-; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
define amdgpu_vs i32 @load_i32_hi0(ptr addrspace(6) inreg %p) #1 {
+; GCN-LABEL: load_i32_hi0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s1, 0
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
%r0 = load i32, ptr addrspace(6) %p
ret i32 %r0
}
-; GCN-LABEL: {{^}}load_i32_hi1:
-; GCN: s_mov_b32 s1, 1
-; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
define amdgpu_vs i32 @load_i32_hi1(ptr addrspace(6) inreg %p) #2 {
+; GCN-LABEL: load_i32_hi1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s1, 1
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
%r0 = load i32, ptr addrspace(6) %p
ret i32 %r0
}
-; GCN-LABEL: {{^}}load_i32_hiffff8000:
-; GCN: s_movk_i32 s1, 0x8000
-; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
define amdgpu_vs i32 @load_i32_hiffff8000(ptr addrspace(6) inreg %p) #3 {
+; GCN-LABEL: load_i32_hiffff8000:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_movk_i32 s1, 0x8000
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
%r0 = load i32, ptr addrspace(6) %p
ret i32 %r0
}
-; GCN-LABEL: {{^}}load_i32_hifffffff0:
-; GCN: s_mov_b32 s1, -16
-; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
define amdgpu_vs i32 @load_i32_hifffffff0(ptr addrspace(6) inreg %p) #4 {
+; GCN-LABEL: load_i32_hifffffff0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s1, -16
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
%r0 = load i32, ptr addrspace(6) %p
ret i32 %r0
}
-; GCN-LABEL: {{^}}load_sampler
-; GCN: v_readfirstlane_b32
-; SI: s_nop
-; GCN: s_load_dwordx8
-; GCN-NEXT: s_load_dwordx4
-; GCN: image_sample
define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler(ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
+; SI-LABEL: load_sampler:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: s_mov_b64 s[6:7], exec
+; SI-NEXT: s_wqm_b64 exec, exec
+; SI-NEXT: s_mov_b32 m0, s5
+; SI-NEXT: v_interp_mov_f32 v0, p0, attr0.x
+; SI-NEXT: v_lshlrev_b32_e32 v0, 6, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s1, v0
+; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: s_mov_b32 s1, 0
+; SI-NEXT: s_nop 2
+; SI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x0
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xc
+; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_and_b64 exec, exec, s[6:7]
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: image_sample v[0:3], v0, s[8:15], s[0:3] dmask:0xf
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: load_sampler:
+; VI: ; %bb.0: ; %main_body
+; VI-NEXT: s_mov_b64 s[6:7], exec
+; VI-NEXT: s_wqm_b64 exec, exec
+; VI-NEXT: s_mov_b32 m0, s5
+; VI-NEXT: v_interp_mov_f32_e32 v0, p0, attr0.x
+; VI-NEXT: v_lshlrev_b32_e32 v0, 6, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s1, v0
+; VI-NEXT: v_readfirstlane_b32 s0, v0
+; VI-NEXT: s_mov_b32 s1, 0
+; VI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x0
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x30
+; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_and_b64 exec, exec, s[6:7]
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: image_sample v[0:3], v0, s[8:15], s[0:3] dmask:0xf
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_sampler:
+; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-NEXT: s_wqm_b64 exec, exec
+; GFX9-NEXT: s_mov_b32 m0, s5
+; GFX9-NEXT: s_mov_b32 s17, 0
+; GFX9-NEXT: v_interp_mov_f32_e32 v0, p0, attr0.x
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 6, s1
+; GFX9-NEXT: v_readfirstlane_b32 s16, v0
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[16:17], 0x0
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x30
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: image_sample v[0:3], v0, s[8:15], s[0:3] dmask:0xf
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
main_body:
%22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8
%23 = bitcast float %22 to i32
@@ -256,13 +877,63 @@ main_body:
ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42
}
-; GCN-LABEL: {{^}}load_sampler_nouniform
-; GCN: v_readfirstlane_b32
-; SI: s_nop
-; GCN: s_load_dwordx8
-; GCN-NEXT: s_load_dwordx4
-; GCN: image_sample
define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler_nouniform(ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
+; SI-LABEL: load_sampler_nouniform:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: s_mov_b64 s[6:7], exec
+; SI-NEXT: s_wqm_b64 exec, exec
+; SI-NEXT: s_mov_b32 m0, s5
+; SI-NEXT: v_interp_mov_f32 v0, p0, attr0.x
+; SI-NEXT: v_lshlrev_b32_e32 v0, 6, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s1, v0
+; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: s_mov_b32 s1, 0
+; SI-NEXT: s_nop 2
+; SI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x0
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xc
+; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_and_b64 exec, exec, s[6:7]
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: image_sample v[0:3], v0, s[8:15], s[0:3] dmask:0xf
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: load_sampler_nouniform:
+; VI: ; %bb.0: ; %main_body
+; VI-NEXT: s_mov_b64 s[6:7], exec
+; VI-NEXT: s_wqm_b64 exec, exec
+; VI-NEXT: s_mov_b32 m0, s5
+; VI-NEXT: v_interp_mov_f32_e32 v0, p0, attr0.x
+; VI-NEXT: v_lshlrev_b32_e32 v0, 6, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s1, v0
+; VI-NEXT: v_readfirstlane_b32 s0, v0
+; VI-NEXT: s_mov_b32 s1, 0
+; VI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x0
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x30
+; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_and_b64 exec, exec, s[6:7]
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: image_sample v[0:3], v0, s[8:15], s[0:3] dmask:0xf
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_sampler_nouniform:
+; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-NEXT: s_wqm_b64 exec, exec
+; GFX9-NEXT: s_mov_b32 m0, s5
+; GFX9-NEXT: s_mov_b32 s17, 0
+; GFX9-NEXT: v_interp_mov_f32_e32 v0, p0, attr0.x
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 6, s1
+; GFX9-NEXT: v_readfirstlane_b32 s16, v0
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[16:17], 0x0
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x30
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: image_sample v[0:3], v0, s[8:15], s[0:3] dmask:0xf
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
main_body:
%22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8
%23 = bitcast float %22 to i32
@@ -288,22 +959,54 @@ main_body:
ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42
}
-; GCN-LABEL: {{^}}load_addr_no_fold:
-; GCN-DAG: s_add_i32 s0, s0, 4
-; GCN-DAG: s_mov_b32 s1, 0
-; GCN: s_load_dword s{{[0-9]}}, s[0:1], 0x0
define amdgpu_vs float @load_addr_no_fold(ptr addrspace(6) inreg noalias %p0) #0 {
+; GCN-LABEL: load_addr_no_fold:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s0, s0, 4
+; GCN-NEXT: s_mov_b32 s1, 0
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
%gep1 = getelementptr i32, ptr addrspace(6) %p0, i32 1
%r1 = load i32, ptr addrspace(6) %gep1
%r2 = bitcast i32 %r1 to float
ret float %r2
}
-; GCN-LABEL: {{^}}vgpr_arg_src:
-; GCN: v_readfirstlane_b32 s[[READLANE:[0-9]+]], v0
-; GCN: s_mov_b32 s[[ZERO:[0-9]+]]
-; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[[[READLANE]]:[[ZERO]]]
define amdgpu_vs float @vgpr_arg_src(ptr addrspace(6) %arg) {
+; SI-LABEL: vgpr_arg_src:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: s_mov_b32 s1, 0
+; SI-NEXT: s_nop 2
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: vgpr_arg_src:
+; VI: ; %bb.0: ; %main_body
+; VI-NEXT: v_readfirstlane_b32 s0, v0
+; VI-NEXT: s_mov_b32 s1, 0
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_nop 1
+; VI-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: vgpr_arg_src:
+; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_nop 1
+; GFX9-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
main_body:
%tmp9 = load ptr addrspace(8), ptr addrspace(6) %arg
%tmp10 = call nsz float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %tmp9, i32 poison, i32 0, i32 0, i32 0) #1
@@ -329,3 +1032,5 @@ attributes #5 = { "InitialPSInputAddr"="45175" }
attributes #6 = { nounwind readnone speculatable }
attributes #7 = { nounwind memory(argmem: read) }
attributes #8 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SICIVI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll
index ac9a279491668..0a9edbb311ace 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll
@@ -10,18 +10,19 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspa
; RRLIST-NEXT: v_mov_b32_e32 v2, 0
; RRLIST-NEXT: s_waitcnt lgkmcnt(0)
; RRLIST-NEXT: s_load_dword s16, s[12:13], 0x0
-; RRLIST-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0
-; RRLIST-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; RRLIST-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; RRLIST-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0
; RRLIST-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x44
-; RRLIST-NEXT: s_load_dword s17, s[14:15], 0x0
+; RRLIST-NEXT: s_nop 0
+; RRLIST-NEXT: s_load_dword s8, s[14:15], 0x0
; RRLIST-NEXT: s_waitcnt lgkmcnt(0)
-; RRLIST-NEXT: s_min_i32 s8, s16, 0
-; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; RRLIST-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
+; RRLIST-NEXT: s_min_i32 s9, s16, 0
+; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; RRLIST-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; RRLIST-NEXT: s_and_b64 s[4:5], vcc, exec
-; RRLIST-NEXT: s_cselect_b32 s4, s16, s17
-; RRLIST-NEXT: s_cmp_eq_u64 s[2:3], s[0:1]
-; RRLIST-NEXT: s_cselect_b32 s0, s8, s4
+; RRLIST-NEXT: s_cselect_b32 s4, s16, s8
+; RRLIST-NEXT: s_cmp_eq_u64 s[0:1], s[2:3]
+; RRLIST-NEXT: s_cselect_b32 s0, s9, s4
; RRLIST-NEXT: v_mov_b32_e32 v0, s0
; RRLIST-NEXT: global_store_dword v2, v0, s[6:7]
; RRLIST-NEXT: s_endpgm
@@ -29,16 +30,16 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspa
; FAST-LABEL: sccClobber:
; FAST: ; %bb.0: ; %entry
; FAST-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; FAST-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x44
; FAST-NEXT: v_mov_b32_e32 v2, 0
; FAST-NEXT: s_waitcnt lgkmcnt(0)
-; FAST-NEXT: s_load_dword s16, s[12:13], 0x0
; FAST-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0
-; FAST-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
-; FAST-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x44
+; FAST-NEXT: s_load_dword s16, s[12:13], 0x0
; FAST-NEXT: s_load_dword s17, s[14:15], 0x0
+; FAST-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; FAST-NEXT: s_waitcnt lgkmcnt(0)
-; FAST-NEXT: s_min_i32 s8, s16, 0
; FAST-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; FAST-NEXT: s_min_i32 s8, s16, 0
; FAST-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
; FAST-NEXT: s_and_b64 s[4:5], vcc, exec
; FAST-NEXT: s_cselect_b32 s4, s16, s17
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index fb418afb8b039..0ac6d5bffc218 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -1205,15 +1205,15 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ushort v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
+; VI-NEXT: flat_load_ushort v1, v[2:3]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_bcnt_u32_b32 v0, v3, v0
+; VI-NEXT: v_bcnt_u32_b32 v0, v0, v1
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
@@ -1292,7 +1292,7 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
; SI-NEXT: .LBB14_4:
-; SI-NEXT: ; implicit-def: $vgpr0
+; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_branch .LBB14_2
;
; VI-LABEL: ctpop_i16_in_br:
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index d1090738e24a6..e81eb018e6a3e 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -2974,15 +2974,15 @@ define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr {
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dword s4, s[0:1], 0x0
; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3
; SI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:2
; SI-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:1
; SI-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
-; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_fma_f32 v0, s0, v0, 0.5
+; SI-NEXT: v_fma_f32 v0, s4, v0, 0.5
; SI-NEXT: v_cvt_u32_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0
@@ -2999,15 +2999,15 @@ define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr {
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s4, s[0:1], 0x0
; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3
; VI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:2
; VI-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:1
; VI-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_f32_e32 v0, s0, v0
+; VI-NEXT: v_mul_f32_e32 v0, s4, v0
; VI-NEXT: v_add_f32_e32 v0, 0.5, v0
; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(2)
@@ -3024,17 +3024,16 @@ define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr {
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: global_load_ubyte v1, v0, s[0:1] offset:3
; GFX10-NEXT: global_load_ubyte v2, v0, s[0:1] offset:2
; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1
; GFX10-NEXT: global_load_ubyte v4, v0, s[0:1]
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt vmcnt(3)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_fma_f32 v0, s0, v0, 0.5
+; GFX10-NEXT: v_fma_f32 v0, s2, v0, 0.5
; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: global_store_byte v[0:1], v2, off
@@ -3073,17 +3072,17 @@ define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr {
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_load_u8 v1, v0, s[0:1] offset:3
; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] offset:2
; GFX11-NEXT: global_load_u8 v3, v0, s[0:1] offset:1
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1]
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_fma_f32 v1, s0, v1, 0.5
+; GFX11-NEXT: v_fma_f32 v1, s2, v1, 0.5
; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_clause 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 45fe2d07226a1..5ce299262805b 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -300,6 +300,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -595,6 +596,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 06c30dfd36033..271160ba73652 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1235,19 +1235,19 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving(
; CI-NEXT: v_add_i32_e32 v3, vcc, s1, v0
; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v1
; CI-NEXT: v_add_i32_e32 v6, vcc, s3, v0
-; CI-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
-; CI-NEXT: ds_read2_b32 v[2:3], v3 offset1:4
+; CI-NEXT: ds_read2_b32 v[0:1], v3 offset1:4
+; CI-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:1
; CI-NEXT: ds_read2_b32 v[6:7], v6 offset1:4
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_waitcnt lgkmcnt(2)
-; CI-NEXT: v_mul_f32_e32 v0, v0, v2
+; CI-NEXT: v_mul_f32_e32 v0, v2, v0
; CI-NEXT: v_add_f32_e32 v0, 2.0, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mul_f32_e32 v2, v4, v6
; CI-NEXT: v_sub_f32_e32 v0, v0, v2
-; CI-NEXT: v_mul_f32_e32 v1, v1, v3
+; CI-NEXT: v_mul_f32_e32 v1, v3, v1
; CI-NEXT: v_sub_f32_e32 v0, v0, v1
; CI-NEXT: v_mul_f32_e32 v1, v5, v7
; CI-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -1265,17 +1265,17 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving(
; GFX9-NEXT: v_add_u32_e32 v3, s1, v0
; GFX9-NEXT: v_add_u32_e32 v4, s2, v1
; GFX9-NEXT: v_add_u32_e32 v6, s3, v0
-; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
-; GFX9-NEXT: ds_read2_b32 v[2:3], v3 offset1:4
+; GFX9-NEXT: ds_read2_b32 v[0:1], v3 offset1:4
+; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1
; GFX9-NEXT: ds_read2_b32 v[6:7], v6 offset1:4
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT: v_mul_f32_e32 v0, v2, v0
; GFX9-NEXT: v_add_f32_e32 v0, 2.0, v0
; GFX9-NEXT: v_mul_f32_e32 v2, v4, v6
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
index 1d83d33a4f832..b64324ab1d4b2 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -2521,6 +2521,7 @@ define amdgpu_kernel void @fcmp_v2f16_lt(
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -2551,6 +2552,7 @@ define amdgpu_kernel void @fcmp_v2f16_lt(
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -2581,6 +2583,7 @@ define amdgpu_kernel void @fcmp_v2f16_lt(
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT: s_clause 0x1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -2612,6 +2615,7 @@ define amdgpu_kernel void @fcmp_v2f16_lt(
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT: s_clause 0x1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -2716,6 +2720,7 @@ define amdgpu_kernel void @fcmp_v2f16_eq(
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -2746,6 +2751,7 @@ define amdgpu_kernel void @fcmp_v2f16_eq(
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -2776,6 +2782,7 @@ define amdgpu_kernel void @fcmp_v2f16_eq(
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT: s_clause 0x1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -2807,6 +2814,7 @@ define amdgpu_kernel void @fcmp_v2f16_eq(
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT: s_clause 0x1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -2910,6 +2918,7 @@ define amdgpu_kernel void @fcmp_v2f16_le(
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -2940,6 +2949,7 @@ define amdgpu_kernel void @fcmp_v2f16_le(
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -2970,6 +2980,7 @@ define amdgpu_kernel void @fcmp_v2f16_le(
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT: s_clause 0x1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -3001,6 +3012,7 @@ define amdgpu_kernel void @fcmp_v2f16_le(
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT: s_clause 0x1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -3104,6 +3116,7 @@ define amdgpu_kernel void @fcmp_v2f16_gt(
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -3134,6 +3147,7 @@ define amdgpu_kernel void @fcmp_v2f16_gt(
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -3164,6 +3178,7 @@ define amdgpu_kernel void @fcmp_v2f16_gt(
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT: s_clause 0x1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -3195,6 +3210,7 @@ define amdgpu_kernel void @fcmp_v2f16_gt(
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT: s_clause 0x1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -3299,6 +3315,7 @@ define amdgpu_kernel void @fcmp_v2f16_lg(
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -3329,6 +3346,7 @@ define amdgpu_kernel void @fcmp_v2f16_lg(
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -3359,6 +3377,7 @@ define amdgpu_kernel void @fcmp_v2f16_lg(
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT: s_clause 0x1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -3390,6 +3409,7 @@ define amdgpu_kernel void @fcmp_v2f16_lg(
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT: s_clause 0x1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -3494,6 +3514,7 @@ define amdgpu_kernel void @fcmp_v2f16_ge(
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -3524,6 +3545,7 @@ define amdgpu_kernel void @fcmp_v2f16_ge(
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -3554,6 +3576,7 @@ define amdgpu_kernel void @fcmp_v2f16_ge(
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT: s_clause 0x1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -3585,6 +3608,7 @@ define amdgpu_kernel void @fcmp_v2f16_ge(
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT: s_clause 0x1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -3689,6 +3713,7 @@ define amdgpu_kernel void @fcmp_v2f16_o(
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -3719,6 +3744,7 @@ define amdgpu_kernel void @fcmp_v2f16_o(
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -3749,6 +3775,7 @@ define amdgpu_kernel void @fcmp_v2f16_o(
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT: s_clause 0x1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -3780,6 +3807,7 @@ define amdgpu_kernel void @fcmp_v2f16_o(
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT: s_clause 0x1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -3884,6 +3912,7 @@ define amdgpu_kernel void @fcmp_v2f16_u(
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -3914,6 +3943,7 @@ define amdgpu_kernel void @fcmp_v2f16_u(
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -3944,6 +3974,7 @@ define amdgpu_kernel void @fcmp_v2f16_u(
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT: s_clause 0x1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -3975,6 +4006,7 @@ define amdgpu_kernel void @fcmp_v2f16_u(
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT: s_clause 0x1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -4078,6 +4110,7 @@ define amdgpu_kernel void @fcmp_v2f16_nge(
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -4108,6 +4141,7 @@ define amdgpu_kernel void @fcmp_v2f16_nge(
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -4138,6 +4172,7 @@ define amdgpu_kernel void @fcmp_v2f16_nge(
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT: s_clause 0x1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -4169,6 +4204,7 @@ define amdgpu_kernel void @fcmp_v2f16_nge(
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT: s_clause 0x1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -4272,6 +4308,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlg(
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -4302,6 +4339,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlg(
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -4332,6 +4370,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlg(
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT: s_clause 0x1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -4363,6 +4402,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlg(
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT: s_clause 0x1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -4467,6 +4507,7 @@ define amdgpu_kernel void @fcmp_v2f16_ngt(
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -4497,6 +4538,7 @@ define amdgpu_kernel void @fcmp_v2f16_ngt(
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -4527,6 +4569,7 @@ define amdgpu_kernel void @fcmp_v2f16_ngt(
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT: s_clause 0x1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -4558,6 +4601,7 @@ define amdgpu_kernel void @fcmp_v2f16_ngt(
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT: s_clause 0x1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -4661,6 +4705,7 @@ define amdgpu_kernel void @fcmp_v2f16_nle(
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -4691,6 +4736,7 @@ define amdgpu_kernel void @fcmp_v2f16_nle(
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -4721,6 +4767,7 @@ define amdgpu_kernel void @fcmp_v2f16_nle(
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT: s_clause 0x1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -4752,6 +4799,7 @@ define amdgpu_kernel void @fcmp_v2f16_nle(
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT: s_clause 0x1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -4855,6 +4903,7 @@ define amdgpu_kernel void @fcmp_v2f16_neq(
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -4885,6 +4934,7 @@ define amdgpu_kernel void @fcmp_v2f16_neq(
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -4915,6 +4965,7 @@ define amdgpu_kernel void @fcmp_v2f16_neq(
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT: s_clause 0x1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -4946,6 +4997,7 @@ define amdgpu_kernel void @fcmp_v2f16_neq(
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT: s_clause 0x1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -5049,6 +5101,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlt(
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -5079,6 +5132,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlt(
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -5109,6 +5163,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlt(
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT: s_clause 0x1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -5140,6 +5195,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlt(
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT: s_clause 0x1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index b4b9c2d3e0135..e8ff1b34f305b 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -962,8 +962,9 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
-; GFX9-NEXT: s_brev_b32 s2, -2
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -976,13 +977,15 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1)
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_b32 v1, v1, s[4:5]
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v2, s[4:5]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -1001,6 +1004,7 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_u16 v1, v1, s[2:3]
; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[4:5]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -1101,6 +1105,7 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 1, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v1, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1121,6 +1126,7 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_u16 v2, v1, s[2:3]
; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v0, s[4:5]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -1202,10 +1208,11 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v1, s[6:7]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: s_brev_b32 s2, -2
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
@@ -1221,6 +1228,7 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[6:7]
; GFX11-TRUE16-NEXT: global_load_b32 v1, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1241,6 +1249,7 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_u16 v1, v1, s[6:7]
; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -1342,6 +1351,7 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 1, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[6:7]
; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1362,6 +1372,7 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_u16 v2, v1, s[6:7]
; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -1444,10 +1455,11 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v1, s[6:7]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_ushort v0, v0, s[2:3]
; GFX9-NEXT: s_movk_i32 s2, 0x7fff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1
; GFX9-NEXT: global_store_short v2, v0, s[0:1]
@@ -1463,6 +1475,7 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v1, v1, s[6:7]
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1484,6 +1497,7 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v1, v1, s[6:7]
; GFX11-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -1582,6 +1596,7 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1602,6 +1617,7 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1)
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[2:3]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -1686,8 +1702,9 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v1, s[2:3]
-; GFX9-NEXT: s_movk_i32 s2, 0x7fff
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_ushort v0, v0, s[6:7]
+; GFX9-NEXT: s_movk_i32 s2, 0x7fff
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -1705,6 +1722,7 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v1, v1, s[2:3]
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1726,6 +1744,7 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v1, v1, s[2:3]
; GFX11-FAKE16-NEXT: global_load_u16 v0, v0, s[4:5]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index a96d022b66f12..6762277a4a651 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -1974,8 +1974,8 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000
; SI-NOFMA-NEXT: s_mov_b32 s10, -1
-; SI-NOFMA-NEXT: s_mov_b32 s14, s10
-; SI-NOFMA-NEXT: s_mov_b32 s15, s11
+; SI-NOFMA-NEXT: s_mov_b32 s18, s10
+; SI-NOFMA-NEXT: s_mov_b32 s19, s11
; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT: s_mov_b32 s16, s4
; SI-NOFMA-NEXT: s_mov_b32 s17, s5
@@ -1983,22 +1983,21 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
; SI-NOFMA-NEXT: s_mov_b32 s5, s7
; SI-NOFMA-NEXT: s_mov_b32 s6, s10
; SI-NOFMA-NEXT: s_mov_b32 s7, s11
+; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NOFMA-NEXT: s_mov_b32 s14, s10
; SI-NOFMA-NEXT: s_mov_b32 s12, s2
; SI-NOFMA-NEXT: s_mov_b32 s13, s3
-; SI-NOFMA-NEXT: s_mov_b32 s18, s10
-; SI-NOFMA-NEXT: s_mov_b32 s19, s11
-; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NOFMA-NEXT: s_mov_b32 s15, s11
; SI-NOFMA-NEXT: buffer_load_dword v2, off, s[12:15], 0
; SI-NOFMA-NEXT: s_mov_b32 s8, s0
; SI-NOFMA-NEXT: s_mov_b32 s9, s1
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(2)
-; SI-NOFMA-NEXT: v_sub_f32_e32 v3, 1.0, v0
; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; SI-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v3
+; SI-NOFMA-NEXT: v_sub_f32_e32 v3, 1.0, v1
+; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v3
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: v_mac_f32_e32 v1, v2, v0
-; SI-NOFMA-NEXT: buffer_store_dword v1, off, s[8:11], 0
+; SI-NOFMA-NEXT: v_mac_f32_e32 v0, v2, v1
+; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NOFMA-NEXT: s_endpgm
;
; SI-FMA-LABEL: test_f32_interp:
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index db0c5362bdc5f..5ab09060403b3 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -8124,6 +8124,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
; GFX11-NEXT: v_add_f32_e32 v1, 0x41800000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_med3_f32 v2, v2, 1.0, 0x41800000
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -8305,6 +8306,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 0x41000000, v1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_med3_f32 v2, v2, s2, 0x41800000
+; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v3, off dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
@@ -8326,6 +8328,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
; GFX11-GISEL-NEXT: v_med3_f32 v2, v3, 0x41000000, v2
; GFX11-GISEL-NEXT: v_add_f32_e32 v3, 0x41800000, v1
; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0x41000000, v1
+; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v3, off dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
index c16fa2d40097d..9c900930c8ac0 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -286,15 +286,15 @@ define amdgpu_kernel void @fmul_v2f16(
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_mov_b32 s15, s7
-; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -305,9 +305,9 @@ define amdgpu_kernel void @fmul_v2f16(
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v2, v3, v2
+; SI-NEXT: v_mul_f32_e32 v2, v2, v3
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-NEXT: v_mul_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -320,20 +320,20 @@ define amdgpu_kernel void @fmul_v2f16(
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_mov_b32 s15, s7
-; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s10, s6
+; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mul_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_mul_f16_e32 v0, v1, v0
+; VI-NEXT: v_mul_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_mul_f16_e32 v0, v0, v1
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -374,6 +374,7 @@ define amdgpu_kernel void @fmul_v2f16(
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s2
; GFX11-NEXT: s_mov_b32 s13, s3
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[4:7], 0
; GFX11-NEXT: s_mov_b32 s8, s0
@@ -586,15 +587,15 @@ define amdgpu_kernel void @fmul_v4f16(
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s6, s2
+; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s10
-; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_mov_b32 s13, s11
-; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s6, s2
+; SI-NEXT: s_mov_b32 s7, s3
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -611,10 +612,10 @@ define amdgpu_kernel void @fmul_v4f16(
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_mul_f32_e32 v5, v7, v5
-; SI-NEXT: v_mul_f32_e32 v4, v6, v4
-; SI-NEXT: v_mul_f32_e32 v1, v3, v1
-; SI-NEXT: v_mul_f32_e32 v0, v2, v0
+; SI-NEXT: v_mul_f32_e32 v5, v5, v7
+; SI-NEXT: v_mul_f32_e32 v4, v4, v6
+; SI-NEXT: v_mul_f32_e32 v1, v1, v3
+; SI-NEXT: v_mul_f32_e32 v0, v0, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v2, v5
@@ -632,22 +633,22 @@ define amdgpu_kernel void @fmul_v4f16(
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_mov_b32 s15, s7
-; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s10, s6
+; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mul_f16_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_mul_f16_e32 v1, v3, v1
-; VI-NEXT: v_mul_f16_sdwa v3, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_mul_f16_e32 v0, v2, v0
+; VI-NEXT: v_mul_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_mul_f16_e32 v1, v1, v3
+; VI-NEXT: v_mul_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_mul_f16_e32 v0, v0, v2
; VI-NEXT: v_or_b32_e32 v1, v1, v4
; VI-NEXT: v_or_b32_e32 v0, v0, v3
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -659,20 +660,20 @@ define amdgpu_kernel void @fmul_v4f16(
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_mov_b32 s14, s6
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s12, s2
; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s14, s6
; GFX9-NEXT: s_mov_b32 s15, s7
-; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
+; GFX9-NEXT: s_mov_b32 s10, s6
+; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v1, v3, v1
-; GFX9-NEXT: v_pk_mul_f16 v0, v2, v0
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
@@ -683,20 +684,21 @@ define amdgpu_kernel void @fmul_v4f16(
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, s10
-; GFX11-NEXT: s_mov_b32 s7, s11
; GFX11-NEXT: s_mov_b32 s14, s10
; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s6, s10
+; GFX11-NEXT: s_mov_b32 s7, s11
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s2
; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[12:15], 0
+; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[4:7], 0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_mul_f16 v1, v3, v1
-; GFX11-NEXT: v_pk_mul_f16 v0, v2, v0
+; GFX11-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX11-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 125d009429cbf..8b28fe04d2205 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -337,21 +337,21 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; CI-NEXT: s_mov_b32 s11, 0xf000
; CI-NEXT: s_mov_b32 s10, -1
; CI-NEXT: s_mov_b32 s6, s10
-; CI-NEXT: s_mov_b32 s7, s11
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8
; CI-NEXT: s_mov_b32 s8, s0
; CI-NEXT: s_mov_b32 s9, s1
; CI-NEXT: s_mov_b32 s0, s2
; CI-NEXT: s_mov_b32 s1, s3
; CI-NEXT: s_mov_b32 s2, s10
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: s_mov_b32 s7, s11
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8
; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_rcp_f32_e32 v2, v1
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_mul_f32_e32 v2, v0, v2
; CI-NEXT: v_trunc_f32_e32 v2, v2
; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
@@ -522,21 +522,21 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; CI-NEXT: s_mov_b32 s11, 0xf000
; CI-NEXT: s_mov_b32 s10, -1
; CI-NEXT: s_mov_b32 s6, s10
-; CI-NEXT: s_mov_b32 s7, s11
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8
; CI-NEXT: s_mov_b32 s8, s0
; CI-NEXT: s_mov_b32 s9, s1
; CI-NEXT: s_mov_b32 s0, s2
; CI-NEXT: s_mov_b32 s1, s3
; CI-NEXT: s_mov_b32 s2, s10
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: s_mov_b32 s7, s11
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8
; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_rcp_f32_e32 v2, v1
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_mul_f32_e32 v2, v0, v2
; CI-NEXT: v_trunc_f32_e32 v2, v2
; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
@@ -2596,16 +2596,16 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5
; VI-NEXT: v_cvt_f32_f16_e32 v9, v8
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; VI-NEXT: v_cvt_f32_f16_e32 v7, v6
; VI-NEXT: v_rcp_f32_e32 v10, v9
; VI-NEXT: v_mul_f32_e32 v11, v7, v10
diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
index a764681645c42..95baeb64ca0de 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -286,15 +286,15 @@ define amdgpu_kernel void @fsub_v2f16(
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_mov_b32 s15, s7
-; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -305,9 +305,9 @@ define amdgpu_kernel void @fsub_v2f16(
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_sub_f32_e32 v2, v3, v2
+; SI-NEXT: v_sub_f32_e32 v2, v2, v3
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_sub_f32_e32 v0, v1, v0
+; SI-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -320,20 +320,20 @@ define amdgpu_kernel void @fsub_v2f16(
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_mov_b32 s15, s7
-; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s10, s6
+; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_sub_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_sub_f16_e32 v0, v1, v0
+; VI-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_sub_f16_e32 v0, v0, v1
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -374,6 +374,7 @@ define amdgpu_kernel void @fsub_v2f16(
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s2
; GFX11-NEXT: s_mov_b32 s13, s3
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[4:7], 0
; GFX11-NEXT: s_mov_b32 s8, s0
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 0db2a1679197e..0795d0c36952d 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -543,25 +543,25 @@ define void @void_func_v32i32_inreg(<32 x i32> inreg %arg0) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_mov_b32 v14, v0
-; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off
-; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v13, s29
-; GFX11-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25
-; GFX11-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27
-; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21
-; GFX11-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23
-; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17
-; GFX11-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19
-; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
-; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
-; GFX11-NEXT: s_clause 0x4
+; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25
+; GFX11-NEXT: v_dual_mov_b32 v8, s26 :: v_dual_mov_b32 v9, s27
+; GFX11-NEXT: v_dual_mov_b32 v16, s20 :: v_dual_mov_b32 v17, s21
+; GFX11-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23
+; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17
+; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v23, s19
+; GFX11-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1
+; GFX11-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v27, s3
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off
-; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
-; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off
-; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off
; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[20:23], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[24:27], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <32 x i32> %arg0, ptr addrspace(1) poison
ret void
@@ -779,25 +779,25 @@ define void @void_func_v16i64_inreg(<16 x i64> inreg %arg0) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_mov_b32 v14, v0
-; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off
-; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v13, s29
-; GFX11-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25
-; GFX11-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27
-; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21
-; GFX11-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23
-; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17
-; GFX11-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19
-; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
-; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
-; GFX11-NEXT: s_clause 0x4
+; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25
+; GFX11-NEXT: v_dual_mov_b32 v8, s26 :: v_dual_mov_b32 v9, s27
+; GFX11-NEXT: v_dual_mov_b32 v16, s20 :: v_dual_mov_b32 v17, s21
+; GFX11-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23
+; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17
+; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v23, s19
+; GFX11-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1
+; GFX11-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v27, s3
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off
-; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
-; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off
-; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off
; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[20:23], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[24:27], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <16 x i64> %arg0, ptr addrspace(1) poison
ret void
@@ -1243,25 +1243,25 @@ define void @void_func_v16f64_inreg(<16 x double> inreg %arg0) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_mov_b32 v14, v0
-; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off
-; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v13, s29
-; GFX11-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25
-; GFX11-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27
-; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21
-; GFX11-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23
-; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17
-; GFX11-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19
-; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
-; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
-; GFX11-NEXT: s_clause 0x4
+; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25
+; GFX11-NEXT: v_dual_mov_b32 v8, s26 :: v_dual_mov_b32 v9, s27
+; GFX11-NEXT: v_dual_mov_b32 v16, s20 :: v_dual_mov_b32 v17, s21
+; GFX11-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23
+; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17
+; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v23, s19
+; GFX11-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1
+; GFX11-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v27, s3
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off
-; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
-; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off
-; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off
; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[20:23], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[24:27], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <16 x double> %arg0, ptr addrspace(1) poison
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 81b8b36180746..bfa50b42881c0 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -2922,8 +2922,8 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:12
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:12
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:8
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
@@ -2944,7 +2944,7 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3185,9 +3185,9 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
@@ -3208,7 +3208,7 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: buffer_store_b64 v[32:33], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3334,10 +3334,10 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:16
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
@@ -3358,15 +3358,14 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: buffer_store_b32 v35, off, s[0:3], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: buffer_store_b32 v36, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b32 v35, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_store_b64 v[32:33], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -3422,14 +3421,14 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x8
; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:20
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt vmcnt(8)
@@ -3449,11 +3448,11 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc
+; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_setpc_b64 s[30:31]
store volatile <32 x i32> %arg0, ptr addrspace(1) poison
@@ -3505,13 +3504,13 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x8
; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:32
; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:28
; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16
; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:20
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
@@ -3532,7 +3531,7 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3698,26 +3697,26 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x10
-; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:48
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:44
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:20
-; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:52
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:48
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt vmcnt(11)
+; GFX11-NEXT: s_waitcnt vmcnt(13)
; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
@@ -3734,15 +3733,14 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: buffer_store_b128 v[52:55], off, s[0:3], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: buffer_store_b128 v[48:51], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: buffer_store_b128 v[52:55], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -3980,39 +3978,39 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:80
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:76
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:96
-; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:92
-; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:88
-; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:112
-; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:108
-; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:104
-; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:128
-; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:124
-; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:120
-; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:20
-; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:116
-; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:100
-; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:84
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:108
+; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:104
+; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:100
+; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:128
+; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:124
+; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:120
+; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:112
+; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:116
+; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:96
+; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:92
+; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:88
+; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:84
+; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:80
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt vmcnt(15)
@@ -4033,26 +4031,22 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: buffer_store_b128 v[84:87], off, s[0:3], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(6)
; GFX11-NEXT: buffer_store_b128 v[80:83], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: buffer_store_b128 v[68:71], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: buffer_store_b128 v[64:67], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: buffer_store_b128 v[52:55], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: buffer_store_b128 v[48:51], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: buffer_store_b128 v[84:87], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index ca9cb456fa19f..e960827aa3a06 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -15791,6 +15791,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s25, 21
; GFX11-NEXT: s_mov_b32 s24, s40
; GFX11-NEXT: s_mov_b32 s25, s41
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b64 off, v[4:5], s2
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: v_writelane_b32 v40, s26, 22
@@ -16232,6 +16233,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX11-NEXT: s_mov_b32 s25, s41
; GFX11-NEXT: v_writelane_b32 v40, s26, 22
; GFX11-NEXT: s_mov_b32 s26, s42
+; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: scratch_store_b32 off, v6, s2
; GFX11-NEXT: scratch_store_b64 off, v[4:5], s3
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
@@ -16960,6 +16962,7 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s0, s32, 16
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
@@ -17248,6 +17251,7 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s0, s32, 16
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0
; GFX11-NEXT: v_mov_b32_e32 v6, 1.0
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 6384fdba7a45a..d14d306baffe6 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -2529,74 +2529,72 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:168
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:164
-; GFX11-NEXT: s_clause 0x11
-; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16
+; GFX11-NEXT: s_clause 0x13
; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:80
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96
-; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92
-; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:76
+; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:96
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:92
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:88
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:108
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:104
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:100
; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112
-; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108
-; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104
+; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:128
+; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:124
+; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:120
; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128
-; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124
-; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:140
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:136
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:132
; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144
-; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140
-; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136
+; GFX11-NEXT: s_clause 0x8
+; GFX11-NEXT: scratch_load_b32 v16, off, s32 offset:160
+; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:156
+; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:152
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:144
+; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:148
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:112
+; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:116
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:80
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:84
; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-NEXT: s_clause 0xd
-; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160
-; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156
-; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152
-; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:148
-; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:132
-; GFX11-NEXT: scratch_load_b32 v16, off, s32 offset:116
-; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:100
-; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:84
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:20
-; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v11, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v10, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v9, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16
; GFX11-NEXT: scratch_load_b32 v32, off, s32
; GFX11-NEXT: s_waitcnt vmcnt(10)
-; GFX11-NEXT: scratch_store_b128 v0, v[60:63], off offset:272
-; GFX11-NEXT: s_waitcnt vmcnt(9)
-; GFX11-NEXT: scratch_store_b128 v0, v[12:15], off offset:256
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:272
+; GFX11-NEXT: scratch_store_b128 v0, v[60:63], off offset:256
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 v0, v[16:19], off offset:240
-; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 v0, v[20:23], off offset:224
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b128 v0, v[20:23], off offset:240
+; GFX11-NEXT: scratch_store_b128 v0, v[56:59], off offset:224
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 v0, v[56:59], off offset:208
-; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 v0, v[41:44], off offset:192
-; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 v0, v[37:40], off offset:176
-; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_store_b128 v0, v[41:44], off offset:208
+; GFX11-NEXT: scratch_store_b128 v0, v[37:40], off offset:192
+; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:176
+; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:160
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:144
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3211,25 +3209,30 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: scratch_store_b32 off, v59, s33
; GFX11-NEXT: s_add_i32 s0, s32, 0xa0
; GFX11-NEXT: s_add_i32 s1, s32, 0x90
+; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: scratch_store_b32 off, v4, s0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1
; GFX11-NEXT: s_add_i32 s0, s32, 0x80
; GFX11-NEXT: s_add_i32 s1, s32, 0x70
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1
; GFX11-NEXT: s_add_i32 s0, s32, 0x60
; GFX11-NEXT: s_add_i32 s1, s32, 0x50
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1
; GFX11-NEXT: s_add_i32 s0, s32, 64
; GFX11-NEXT: s_add_i32 s1, s32, 48
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1
; GFX11-NEXT: s_add_i32 s0, s32, 32
; GFX11-NEXT: s_add_i32 s1, s32, 16
; GFX11-NEXT: s_add_i32 s2, s33, 0x200
; GFX11-NEXT: v_writelane_b32 v60, s30, 0
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, 0
@@ -3288,7 +3291,8 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1552 ; 16-byte Folded Spill
; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:608
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1536 ; 16-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1536
; GFX11-NEXT: scratch_store_b128 off, v[32:35], s32
; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v32, v36
; GFX11-NEXT: v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 82c58394c03bb..7c3191a7e1f20 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -743,15 +743,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1961,15 +1961,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -3239,15 +3239,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -4013,15 +4013,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -5316,15 +5316,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -6096,12 +6096,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1164-NEXT: s_getpc_b64 s[0:1]
; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: v_mov_b32_e32 v7, 0
@@ -6111,7 +6108,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1164-NEXT: s_mov_b32 s12, s51
; GFX1164-NEXT: s_mov_b32 s13, s50
; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -6166,10 +6167,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-NEXT: s_getpc_b64 s[0:1]
; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -6178,7 +6177,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-NEXT: s_mov_b32 s12, s51
; GFX1132-NEXT: s_mov_b32 s13, s50
; GFX1132-NEXT: s_mov_b32 s14, s33
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -6555,12 +6557,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
@@ -6570,7 +6569,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: s_mov_b32 s12, s51
; GFX1164-DPP-NEXT: s_mov_b32 s13, s50
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -6625,10 +6628,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -6637,7 +6638,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: s_mov_b32 s12, s51
; GFX1132-DPP-NEXT: s_mov_b32 s13, s50
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -7149,12 +7153,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: s_getpc_b64 s[0:1]
; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: v_mov_b32_e32 v7, 0
@@ -7164,7 +7165,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: s_mov_b32 s12, s51
; GFX1164-NEXT: s_mov_b32 s13, s50
; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -7245,10 +7250,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_getpc_b64 s[0:1]
; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -7257,7 +7260,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_mov_b32 s12, s51
; GFX1132-NEXT: s_mov_b32 s13, s50
; GFX1132-NEXT: s_mov_b32 s14, s33
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -7288,7 +7294,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
@@ -7296,6 +7301,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
@@ -7830,12 +7836,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
@@ -7845,7 +7848,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_mov_b32 s12, s51
; GFX1164-DPP-NEXT: s_mov_b32 s13, s50
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -7945,10 +7952,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -7957,7 +7962,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_mov_b32 s12, s51
; GFX1132-DPP-NEXT: s_mov_b32 s13, s50
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -8922,15 +8930,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -10355,15 +10363,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -11270,15 +11278,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -12130,12 +12138,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1164-NEXT: s_getpc_b64 s[0:1]
; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: v_mov_b32_e32 v7, 0
@@ -12145,7 +12150,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1164-NEXT: s_mov_b32 s12, s51
; GFX1164-NEXT: s_mov_b32 s13, s50
; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -12206,10 +12215,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-NEXT: s_getpc_b64 s[0:1]
; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -12218,7 +12225,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-NEXT: s_mov_b32 s12, s51
; GFX1132-NEXT: s_mov_b32 s13, s50
; GFX1132-NEXT: s_mov_b32 s14, s33
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -12605,12 +12615,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
@@ -12620,7 +12627,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: s_mov_b32 s12, s51
; GFX1164-DPP-NEXT: s_mov_b32 s13, s50
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12681,10 +12692,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -12693,7 +12702,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: s_mov_b32 s12, s51
; GFX1132-DPP-NEXT: s_mov_b32 s13, s50
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -13205,12 +13217,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-NEXT: s_getpc_b64 s[0:1]
; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: v_mov_b32_e32 v7, 0
@@ -13220,7 +13229,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-NEXT: s_mov_b32 s12, s51
; GFX1164-NEXT: s_mov_b32 s13, s50
; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -13301,10 +13314,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-NEXT: s_getpc_b64 s[0:1]
; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -13313,7 +13324,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-NEXT: s_mov_b32 s12, s51
; GFX1132-NEXT: s_mov_b32 s13, s50
; GFX1132-NEXT: s_mov_b32 s14, s33
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -13344,7 +13358,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
@@ -13352,6 +13365,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
@@ -13886,12 +13900,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
@@ -13901,7 +13912,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_mov_b32 s12, s51
; GFX1164-DPP-NEXT: s_mov_b32 s13, s50
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -14001,10 +14016,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -14013,7 +14026,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_mov_b32 s12, s51
; GFX1132-DPP-NEXT: s_mov_b32 s13, s50
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index f8f911b693e09..2a640dda13ff3 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -645,15 +645,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1684,15 +1684,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -2723,15 +2723,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -3490,10 +3490,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1164-NEXT: s_mov_b32 s13, s50
; GFX1164-NEXT: s_mov_b32 s14, s33
; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1164-NEXT: s_clause 0x1
; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_mov_b32_e32 v2, s52
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -3555,9 +3556,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1132-NEXT: s_mov_b32 s14, s33
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -3930,10 +3932,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: s_mov_b32 s13, s50
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1164-DPP-NEXT: s_clause 0x1
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -3995,9 +3998,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -4541,12 +4545,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-NEXT: s_mov_b32 s13, s50
; GFX1164-NEXT: s_mov_b32 s14, s33
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1164-NEXT: s_clause 0x1
; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8
; GFX1164-NEXT: v_mov_b32_e32 v0, 8
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off
@@ -4639,13 +4644,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-NEXT: s_mov_b32 s12, s51
; GFX1132-NEXT: s_mov_b32 s13, s50
; GFX1132-NEXT: s_mov_b32 s14, s33
-; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: v_mov_b32_e32 v2, s52
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off
-; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8
; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT: v_mov_b32_e32 v5, 8
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off
@@ -4675,7 +4682,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
@@ -4683,6 +4689,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
@@ -5267,10 +5274,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_mov_b32 s13, s50
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42]
+; GFX1164-DPP-NEXT: s_clause 0x1
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -5388,9 +5396,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42]
+; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -6170,15 +6179,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -7036,10 +7045,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1164-NEXT: s_mov_b32 s13, s50
; GFX1164-NEXT: s_mov_b32 s14, s33
; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1164-NEXT: s_clause 0x1
; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_mov_b32_e32 v2, s52
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -7101,9 +7111,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1132-NEXT: s_mov_b32 s14, s33
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -7476,10 +7487,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: s_mov_b32 s13, s50
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1164-DPP-NEXT: s_clause 0x1
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -7541,9 +7553,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -8087,12 +8100,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-NEXT: s_mov_b32 s13, s50
; GFX1164-NEXT: s_mov_b32 s14, s33
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1164-NEXT: s_clause 0x1
; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8
; GFX1164-NEXT: v_mov_b32_e32 v0, 8
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off
@@ -8185,13 +8199,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-NEXT: s_mov_b32 s12, s51
; GFX1132-NEXT: s_mov_b32 s13, s50
; GFX1132-NEXT: s_mov_b32 s14, s33
-; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: v_mov_b32_e32 v2, s52
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off
-; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8
; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT: v_mov_b32_e32 v5, 8
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off
@@ -8221,7 +8237,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
@@ -8229,6 +8244,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
@@ -8813,10 +8829,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_mov_b32 s13, s50
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42]
+; GFX1164-DPP-NEXT: s_clause 0x1
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -8934,9 +8951,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42]
+; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 1f76a476107a3..dab2066706b5a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -645,15 +645,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1684,15 +1684,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -2723,15 +2723,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -3490,10 +3490,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1164-NEXT: s_mov_b32 s13, s50
; GFX1164-NEXT: s_mov_b32 s14, s33
; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1164-NEXT: s_clause 0x1
; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_mov_b32_e32 v2, s52
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -3555,9 +3556,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1132-NEXT: s_mov_b32 s14, s33
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -3930,10 +3932,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: s_mov_b32 s13, s50
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1164-DPP-NEXT: s_clause 0x1
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -3995,9 +3998,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -4541,12 +4545,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-NEXT: s_mov_b32 s13, s50
; GFX1164-NEXT: s_mov_b32 s14, s33
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1164-NEXT: s_clause 0x1
; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8
; GFX1164-NEXT: v_mov_b32_e32 v0, 8
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off
@@ -4639,13 +4644,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-NEXT: s_mov_b32 s12, s51
; GFX1132-NEXT: s_mov_b32 s13, s50
; GFX1132-NEXT: s_mov_b32 s14, s33
-; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: v_mov_b32_e32 v2, s52
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off
-; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8
; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT: v_mov_b32_e32 v5, 8
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off
@@ -4675,7 +4682,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
@@ -4683,6 +4689,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
@@ -5267,10 +5274,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_mov_b32 s13, s50
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42]
+; GFX1164-DPP-NEXT: s_clause 0x1
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -5388,9 +5396,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42]
+; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -6170,15 +6179,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.double.value at gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.double.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -7036,10 +7045,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1164-NEXT: s_mov_b32 s13, s50
; GFX1164-NEXT: s_mov_b32 s14, s33
; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1164-NEXT: s_clause 0x1
; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_mov_b32_e32 v2, s52
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -7101,9 +7111,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1132-NEXT: s_mov_b32 s14, s33
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -7476,10 +7487,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: s_mov_b32 s13, s50
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1164-DPP-NEXT: s_clause 0x1
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -7541,9 +7553,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -8087,12 +8100,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-NEXT: s_mov_b32 s13, s50
; GFX1164-NEXT: s_mov_b32 s14, s33
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1164-NEXT: s_clause 0x1
; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8
; GFX1164-NEXT: v_mov_b32_e32 v0, 8
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off
@@ -8185,13 +8199,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-NEXT: s_mov_b32 s12, s51
; GFX1132-NEXT: s_mov_b32 s13, s50
; GFX1132-NEXT: s_mov_b32 s14, s33
-; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: v_mov_b32_e32 v2, s52
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off
-; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8
; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT: v_mov_b32_e32 v5, 8
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off
@@ -8221,7 +8237,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
@@ -8229,6 +8244,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
@@ -8813,10 +8829,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_mov_b32 s13, s50
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42]
+; GFX1164-DPP-NEXT: s_clause 0x1
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -8934,9 +8951,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42]
+; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 9db3c37045ccf..f9cab76900385 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -829,15 +829,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -2159,15 +2159,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -3489,15 +3489,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -4315,15 +4315,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -5644,15 +5644,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -6424,12 +6424,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-NEXT: s_getpc_b64 s[0:1]
; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: v_mov_b32_e32 v7, 0
@@ -6439,7 +6436,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-NEXT: s_mov_b32 s12, s51
; GFX1164-NEXT: s_mov_b32 s13, s50
; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -6494,10 +6495,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-NEXT: s_getpc_b64 s[0:1]
; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -6506,7 +6505,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-NEXT: s_mov_b32 s12, s51
; GFX1132-NEXT: s_mov_b32 s13, s50
; GFX1132-NEXT: s_mov_b32 s14, s33
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -6883,12 +6885,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
@@ -6898,7 +6897,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: s_mov_b32 s12, s51
; GFX1164-DPP-NEXT: s_mov_b32 s13, s50
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -6953,10 +6956,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -6965,7 +6966,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: s_mov_b32 s12, s51
; GFX1132-DPP-NEXT: s_mov_b32 s13, s50
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -7477,12 +7481,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: s_getpc_b64 s[0:1]
; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: v_mov_b32_e32 v7, 0
@@ -7492,7 +7493,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: s_mov_b32 s12, s51
; GFX1164-NEXT: s_mov_b32 s13, s50
; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -7573,10 +7578,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_getpc_b64 s[0:1]
; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -7585,7 +7588,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_mov_b32 s12, s51
; GFX1132-NEXT: s_mov_b32 s13, s50
; GFX1132-NEXT: s_mov_b32 s14, s33
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -7616,7 +7622,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
@@ -7624,6 +7629,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
@@ -8158,12 +8164,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
@@ -8173,7 +8176,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_mov_b32 s12, s51
; GFX1164-DPP-NEXT: s_mov_b32 s13, s50
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -8273,10 +8280,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -8285,7 +8290,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_mov_b32 s12, s51
; GFX1132-DPP-NEXT: s_mov_b32 s13, s50
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -9249,15 +9257,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.double.value at gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.double.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -10682,15 +10690,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.double.value at gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.double.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -11597,15 +11605,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7]
+; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
@@ -12456,12 +12464,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1164-NEXT: s_getpc_b64 s[0:1]
; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: v_mov_b32_e32 v7, 0
@@ -12471,7 +12476,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1164-NEXT: s_mov_b32 s12, s51
; GFX1164-NEXT: s_mov_b32 s13, s50
; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -12532,10 +12541,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-NEXT: s_getpc_b64 s[0:1]
; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -12544,7 +12551,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-NEXT: s_mov_b32 s12, s51
; GFX1132-NEXT: s_mov_b32 s13, s50
; GFX1132-NEXT: s_mov_b32 s14, s33
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -12931,12 +12941,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
@@ -12946,7 +12953,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: s_mov_b32 s12, s51
; GFX1164-DPP-NEXT: s_mov_b32 s13, s50
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13007,10 +13018,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -13019,7 +13028,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: s_mov_b32 s12, s51
; GFX1132-DPP-NEXT: s_mov_b32 s13, s50
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -13531,12 +13543,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-NEXT: s_getpc_b64 s[0:1]
; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: v_mov_b32_e32 v7, 0
@@ -13546,7 +13555,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-NEXT: s_mov_b32 s12, s51
; GFX1164-NEXT: s_mov_b32 s13, s50
; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -13627,10 +13640,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-NEXT: s_getpc_b64 s[0:1]
; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -13639,7 +13650,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-NEXT: s_mov_b32 s12, s51
; GFX1132-NEXT: s_mov_b32 s13, s50
; GFX1132-NEXT: s_mov_b32 s14, s33
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -13670,7 +13684,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1]
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000
; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
@@ -13678,6 +13691,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value at gotpcrel32@lo+4
; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
@@ -14212,12 +14226,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
@@ -14227,7 +14238,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_mov_b32 s12, s51
; GFX1164-DPP-NEXT: s_mov_b32 s13, s50
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -14327,10 +14342,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -14339,7 +14352,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_mov_b32 s12, s51
; GFX1132-DPP-NEXT: s_mov_b32 s13, s50
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
index 80d4fa69be425..8eae0db8a577a 100644
--- a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
+++ b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
@@ -9,10 +9,11 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in
; GFX11-NEXT: s_mov_b32 m0, s4
; GFX11-NEXT: s_getpc_b64 s[4:5]
; GFX11-NEXT: s_mov_b32 s0, s1
-; GFX11-NEXT: s_mov_b32 s6, s3
; GFX11-NEXT: s_mov_b32 s1, s5
+; GFX11-NEXT: s_mov_b32 s6, s3
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s7, s5
+; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x0
; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
; GFX11-NEXT: s_load_b256 s[0:7], s[6:7], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 56ceba258f471..da0db4d1cd1fb 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -22,36 +22,35 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v5, s52, 10
; CHECK-NEXT: v_writelane_b32 v5, s53, 11
; CHECK-NEXT: v_writelane_b32 v5, s54, 12
-; CHECK-NEXT: v_writelane_b32 v5, s55, 13
; CHECK-NEXT: s_getpc_b64 s[24:25]
-; CHECK-NEXT: v_writelane_b32 v5, s64, 14
+; CHECK-NEXT: v_writelane_b32 v5, s55, 13
; CHECK-NEXT: s_movk_i32 s4, 0xf0
; CHECK-NEXT: s_mov_b32 s5, s24
-; CHECK-NEXT: v_writelane_b32 v5, s65, 15
+; CHECK-NEXT: v_writelane_b32 v5, s64, 14
; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
-; CHECK-NEXT: v_writelane_b32 v5, s66, 16
+; CHECK-NEXT: v_writelane_b32 v5, s65, 15
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v5, s67, 17
+; CHECK-NEXT: v_writelane_b32 v5, s66, 16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_movk_i32 s6, 0x130
; CHECK-NEXT: s_mov_b32 s7, s24
-; CHECK-NEXT: v_writelane_b32 v5, s68, 18
+; CHECK-NEXT: v_writelane_b32 v5, s67, 17
; CHECK-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0
+; CHECK-NEXT: v_writelane_b32 v5, s68, 18
; CHECK-NEXT: v_writelane_b32 v5, s69, 19
; CHECK-NEXT: v_writelane_b32 v5, s70, 20
; CHECK-NEXT: s_mov_b32 s68, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_writelane_b32 v5, s71, 21
-; CHECK-NEXT: v_mov_b32_e32 v2, s4
-; CHECK-NEXT: v_mov_b32_e32 v3, v1
+; CHECK-NEXT: v_mov_b32_e32 v3, s4
+; CHECK-NEXT: v_mov_b32_e32 v4, v1
; CHECK-NEXT: s_mov_b32 s69, s68
; CHECK-NEXT: s_mov_b32 s70, s68
; CHECK-NEXT: s_mov_b32 s71, s68
-; CHECK-NEXT: image_sample_lz v3, v[2:3], s[16:23], s[68:71] dmask:0x1
; CHECK-NEXT: v_mov_b32_e32 v2, v1
; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
-; CHECK-NEXT: s_mov_b32 s6, 48
+; CHECK-NEXT: image_sample_lz v3, v[3:4], s[16:23], s[68:71] dmask:0x1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v6, s36, 0
; CHECK-NEXT: v_writelane_b32 v6, s37, 1
@@ -69,6 +68,7 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v6, s48, 12
; CHECK-NEXT: v_writelane_b32 v6, s49, 13
; CHECK-NEXT: v_writelane_b32 v6, s50, 14
+; CHECK-NEXT: s_mov_b32 s6, 48
; CHECK-NEXT: s_movk_i32 s56, 0x1f0
; CHECK-NEXT: s_movk_i32 s72, 0x2f0
; CHECK-NEXT: s_mov_b32 s57, s24
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index b443e654350c5..b064689f25c9d 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -22,9 +22,9 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
@@ -47,21 +47,21 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
-; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
+; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0
+; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -73,13 +73,13 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -91,12 +91,12 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -108,13 +108,13 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v1, v2, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -161,9 +161,9 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
@@ -187,18 +187,17 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mul_u32_u24_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX8-NEXT: v_mul_u32_u24_sdwa v1, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
@@ -212,13 +211,13 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, s0
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -230,13 +229,13 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -250,13 +249,13 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, s0
@@ -302,9 +301,9 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
@@ -327,21 +326,21 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX8-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
-; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
+; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, s0
+; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -353,13 +352,13 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -371,12 +370,12 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -388,13 +387,13 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0
+; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v1, v2, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -438,9 +437,9 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
@@ -463,21 +462,21 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 16
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
-; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
+; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0
+; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -489,13 +488,13 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -507,13 +506,13 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -527,13 +526,13 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
@@ -580,9 +579,9 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
@@ -605,21 +604,21 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
-; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
+; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0
+; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -653,12 +652,12 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -670,13 +669,13 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v1, v2, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -720,9 +719,9 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
@@ -745,21 +744,21 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
+; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX8-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
-; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
+; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, s0
+; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -771,13 +770,13 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -789,13 +788,13 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -809,13 +808,13 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
@@ -862,9 +861,9 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
@@ -885,19 +884,19 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v0, s0
-; GFX8-NEXT: v_mad_u32_u24 v2, v1, v1, v0
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s0
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -1055,13 +1054,13 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -1073,12 +1072,12 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -1090,13 +1089,13 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v1, v2, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -1140,9 +1139,9 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
@@ -1170,20 +1169,20 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s0
-; GFX8-NEXT: v_mad_u32_u24 v2, v3, v1, v0
+; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0
+; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -1195,13 +1194,13 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] offset:4
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] offset:4
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3] offset:4
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1] offset:4
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -1213,12 +1212,12 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] offset:4
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] offset:4
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] offset:4
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -1230,13 +1229,13 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] offset:4
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] offset:4
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] offset:4
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v1, v2, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -1332,13 +1331,13 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
+; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
+; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0
; GFX9-NODL-NEXT: global_store_dword v4, v0, s[6:7]
@@ -1350,13 +1349,13 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
+; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
+; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0
; GFX9-DL-NEXT: global_store_dword v4, v0, s[6:7]
@@ -1370,13 +1369,13 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
+; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
+; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
@@ -1475,13 +1474,13 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
+; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
+; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0
; GFX9-NODL-NEXT: global_store_dword v4, v0, s[6:7]
@@ -1493,13 +1492,13 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
+; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
+; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0
; GFX9-DL-NEXT: global_store_dword v4, v0, s[6:7]
@@ -1513,13 +1512,13 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
+; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
+; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
@@ -1566,9 +1565,9 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
@@ -1591,21 +1590,21 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
-; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
+; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0
+; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -1617,13 +1616,13 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -1635,13 +1634,13 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -1655,13 +1654,13 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
@@ -1708,9 +1707,9 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
@@ -1734,21 +1733,21 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
-; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v0
+; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0
+; GFX8-NEXT: v_mad_u32_u24 v1, v3, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1761,16 +1760,16 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
@@ -1781,16 +1780,16 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
@@ -1861,9 +1860,9 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
@@ -1887,21 +1886,21 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX8-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
-; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v0
+; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, s0
+; GFX8-NEXT: v_mad_i32_i24 v1, v3, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1914,16 +1913,16 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
+; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
@@ -1934,16 +1933,16 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
+; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
@@ -2014,9 +2013,9 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
@@ -2040,22 +2039,22 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s0
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4
-; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
+; GFX8-NEXT: v_mad_u32_u24 v4, v3, v2, s0
+; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, v4
+; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -2173,9 +2172,9 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
@@ -2199,22 +2198,22 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX8-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v4, v2, v1, s0
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4
-; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
+; GFX8-NEXT: v_mad_i32_i24 v4, v3, v2, s0
+; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, v4
+; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -2332,9 +2331,9 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
@@ -2358,22 +2357,22 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s0
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4
-; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
+; GFX8-NEXT: v_mad_u32_u24 v4, v1, v0, s0
+; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, v4
+; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -2385,17 +2384,17 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
+; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v1, v2
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
@@ -2406,17 +2405,17 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v1, v2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
@@ -2489,9 +2488,9 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
@@ -2515,22 +2514,22 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX8-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s0
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4
-; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
+; GFX8-NEXT: v_mad_i32_i24 v4, v1, v0, s0
+; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, v4
+; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -2542,17 +2541,17 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
-; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
+; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
+; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v1, v2
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
@@ -2563,17 +2562,17 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
+; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
-; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v1, v2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
@@ -2645,19 +2644,18 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v3, off, s[4:7], 0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b32 s6, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[4:7], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, v3
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -2671,22 +2669,22 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: flat_load_ushort v4, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
+; GFX8-NEXT: flat_load_ushort v2, v[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u16 v4, v5, v6, v4
-; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
-; GFX8-NEXT: flat_store_short v[0:1], v2
+; GFX8-NEXT: v_mad_u16 v2, v3, v6, v2
+; GFX8-NEXT: v_mad_u16 v0, v0, v1, v2
+; GFX8-NEXT: flat_store_short v[4:5], v0
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot2_acc16:
@@ -2737,7 +2735,7 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_clause 0x1
+; GFX10-DL-NEXT: s_clause 0x2
; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: global_load_ushort v4, v1, s[6:7]
@@ -2787,9 +2785,9 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
@@ -2812,23 +2810,23 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: flat_load_ushort v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3
-; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
-; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
+; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, s0
+; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -2840,15 +2838,15 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index d28f0a190e117..3491785a9c5dc 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -21,19 +21,19 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
+; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v5, v0, 0, 8
; GFX7-NEXT: v_bfe_i32 v6, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v1, v1, v5, s4
-; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8
+; GFX7-NEXT: v_mad_i32_i24 v1, v1, v5, s8
; GFX7-NEXT: v_bfe_i32 v7, v0, 16, 8
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v6, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2
@@ -52,27 +52,27 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
-; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
-; GFX8-NEXT: v_bfe_i32 v6, v3, 16, 8
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 24, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
-; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8
+; GFX8-NEXT: v_bfe_i32 v4, v0, 8, 8
+; GFX8-NEXT: v_bfe_i32 v5, v1, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
-; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8
-; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
+; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, s0
+; GFX8-NEXT: v_bfe_i32 v6, v0, 16, 8
+; GFX8-NEXT: v_bfe_i32 v7, v1, 16, 8
+; GFX8-NEXT: v_mad_i32_i24 v2, v4, v5, v2
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0
-; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1
-; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v1, 24, v1
+; GFX8-NEXT: v_mad_i32_i24 v2, v6, v7, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -84,15 +84,15 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
@@ -105,12 +105,12 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -123,14 +123,14 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
+; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v2, v1
; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
@@ -144,11 +144,11 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -205,25 +205,24 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8
+; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: v_bfe_i32 v7, v0, 8, 8
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT: v_bfe_i32 v8, v0, 16, 8
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v6, v3
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0
@@ -245,35 +244,35 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: flat_load_ushort v4, v[0:1]
+; GFX8-NEXT: flat_load_ushort v3, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_bfe_i32 v7, v3, 0, 8
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 8
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
-; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX8-NEXT: v_bfe_i32 v7, v4, 0, 8
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v8, v2, 0, 8
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 8
; GFX8-NEXT: v_bfe_i32 v10, v10, 0, 8
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u16 v4, v7, v8, v4
+; GFX8-NEXT: v_mad_u16 v3, v7, v8, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
+; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 8
-; GFX8-NEXT: v_mad_u16 v4, v9, v10, v4
-; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX8-NEXT: v_mad_u16 v3, v9, v10, v3
+; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX8-NEXT: v_mad_u16 v4, v5, v6, v4
-; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
+; GFX8-NEXT: v_mad_u16 v3, v5, v6, v3
+; GFX8-NEXT: v_mad_u16 v2, v4, v2, v3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -318,11 +317,11 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
; GFX9-DL-NEXT: global_load_sshort v4, v1, s[6:7]
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_i32_i8 v0, v2, v3, v4
+; GFX9-DL-NEXT: v_dot4_i32_i8 v0, v3, v2, v4
; GFX9-DL-NEXT: global_store_short v1, v0, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -334,12 +333,12 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
+; GFX10-DL-NEXT: s_clause 0x2
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
; GFX10-DL-NEXT: global_load_sshort v4, v1, s[6:7]
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_dot4c_i32_i8 v4, v2, v3
+; GFX10-DL-NEXT: v_dot4c_i32_i8 v4, v3, v2
; GFX10-DL-NEXT: global_store_short v1, v4, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
@@ -352,12 +351,12 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-NEXT: s_clause 0x2
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-DL-NEXT: global_load_i16 v3, v1, s[4:5]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v2, v0, v3 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v2, v3 neg_lo:[1,1,0]
; GFX11-DL-NEXT: global_store_b16 v1, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -405,36 +404,35 @@ entry:
define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc8:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b32 s10, 0
-; GFX7-NEXT: s_mov_b32 s11, s7
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b32 s6, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
+; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
-; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v6, v3
; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc8:
@@ -446,25 +444,25 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
+; GFX8-NEXT: flat_load_ubyte v3, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
+; GFX8-NEXT: v_mad_u16 v2, v4, v2, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_mad_u16 v2, v7, v8, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4
; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2
; GFX8-NEXT: v_mad_u16 v2, v9, v10, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
@@ -503,11 +501,11 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[6:7]
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4
+; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4
; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -519,12 +517,12 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
+; GFX10-DL-NEXT: s_clause 0x2
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7]
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4
; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
@@ -537,12 +535,12 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-NEXT: s_clause 0x2
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3
; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -592,20 +590,20 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
+; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v5, v0, 0, 8
; GFX7-NEXT: v_bfe_i32 v6, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v8, v1, v5, s4
+; GFX7-NEXT: v_mad_i32_i24 v8, v1, v5, s8
; GFX7-NEXT: v_mad_i32_i24 v3, v3, v6, v8
-; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8
; GFX7-NEXT: v_bfe_i32 v7, v0, 16, 8
; GFX7-NEXT: v_mad_i32_i24 v1, v1, v5, v3
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2
@@ -624,28 +622,28 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
-; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
-; GFX8-NEXT: v_bfe_i32 v6, v3, 16, 8
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 24, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
-; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8
+; GFX8-NEXT: v_bfe_i32 v4, v0, 8, 8
+; GFX8-NEXT: v_bfe_i32 v5, v1, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v8, v1, v2, s0
+; GFX8-NEXT: v_mad_i32_i24 v8, v2, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v4, v4, v5, v8
-; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v4
+; GFX8-NEXT: v_bfe_i32 v6, v0, 16, 8
+; GFX8-NEXT: v_bfe_i32 v7, v1, 16, 8
+; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, v4
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0
-; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1
-; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v1, 24, v1
+; GFX8-NEXT: v_mad_i32_i24 v2, v6, v7, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -799,9 +797,9 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 24, v2
@@ -814,7 +812,7 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_i32 v7, v0, 8, 8
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s4
+; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s8
; GFX7-NEXT: v_mad_i32_i24 v0, v4, v7, v0
; GFX7-NEXT: v_mad_i32_i24 v0, v3, v6, v0
; GFX7-NEXT: v_mad_i32_i24 v0, v1, v5, v0
@@ -830,27 +828,27 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v3
-; GFX8-NEXT: v_ashrrev_i32_e32 v4, 24, v3
-; GFX8-NEXT: v_bfe_i32 v5, v3, 16, 8
-; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v0
-; GFX8-NEXT: v_ashrrev_i32_e32 v6, 24, v0
-; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v4, 24, v0
+; GFX8-NEXT: v_bfe_i32 v5, v0, 16, 8
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX8-NEXT: v_ashrrev_i32_e32 v6, 24, v1
+; GFX8-NEXT: v_bfe_i32 v7, v1, 16, 8
+; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s0
-; GFX8-NEXT: v_mad_i32_i24 v0, v1, v2, v0
+; GFX8-NEXT: v_mad_i32_i24 v0, v0, v1, s0
+; GFX8-NEXT: v_mad_i32_i24 v0, v2, v3, v0
; GFX8-NEXT: v_mad_i32_i24 v0, v5, v7, v0
; GFX8-NEXT: v_mad_i32_i24 v2, v4, v6, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
@@ -888,12 +886,12 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -906,14 +904,14 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
+; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v2, v1
; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
@@ -927,11 +925,11 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -974,17 +972,17 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v4, v2, 0, 8
-; GFX7-NEXT: v_bfe_i32 v3, v2, 16, 8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_i32 v7, v0, 0, 8
+; GFX7-NEXT: v_bfe_i32 v1, v2, 16, 8
; GFX7-NEXT: v_ashrrev_i32_e32 v5, 24, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v7, v0, 0, 8
; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT: v_bfe_i32 v6, v0, 16, 8
@@ -993,14 +991,13 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT: v_mad_u32_u24 v3, v4, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v1, v6, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -1014,26 +1011,26 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: flat_load_ushort v4, v[0:1]
+; GFX8-NEXT: flat_load_ushort v3, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-NEXT: v_ashrrev_i16_e32 v7, 8, v3
-; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX8-NEXT: v_ashrrev_i16_e32 v9, 8, v5
-; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: v_ashrrev_i16_e32 v7, 8, v4
+; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8
; GFX8-NEXT: v_ashrrev_i16_e32 v8, 8, v2
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
+; GFX8-NEXT: v_mad_u16 v2, v4, v2, v3
+; GFX8-NEXT: v_ashrrev_i16_e32 v9, 8, v5
+; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX8-NEXT: v_ashrrev_i16_e32 v10, 8, v6
; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 8
; GFX8-NEXT: v_mad_u16 v2, v7, v8, v2
@@ -1164,7 +1161,7 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-TRUE16-NEXT: s_clause 0x1
+; GFX11-DL-TRUE16-NEXT: s_clause 0x2
; GFX11-DL-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[0:1]
; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v3, s[4:5]
@@ -1209,7 +1206,7 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-DL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-FAKE16-NEXT: s_clause 0x1
+; GFX11-DL-FAKE16-NEXT: s_clause 0x2
; GFX11-DL-FAKE16-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-DL-FAKE16-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-DL-FAKE16-NEXT: global_load_u16 v3, v2, s[4:5]
@@ -1287,9 +1284,9 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
@@ -1312,21 +1309,21 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
-; GFX8-NEXT: v_bfe_i32 v3, v3, 8, 8
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8
; GFX8-NEXT: v_bfe_i32 v0, v0, 8, 8
+; GFX8-NEXT: v_bfe_i32 v1, v1, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
-; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, s0
+; GFX8-NEXT: v_mad_i32_i24 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -1338,13 +1335,13 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -1457,19 +1454,19 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
+; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v4, v0, 0, 8
; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v1, v1, v4, s0
-; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
@@ -1485,24 +1482,24 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
-; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
-; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
-; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8
+; GFX8-NEXT: v_bfe_i32 v4, v0, 8, 8
+; GFX8-NEXT: v_bfe_i32 v5, v1, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
+; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, s0
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
-; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
-; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT: v_bfe_i32 v1, v1, 16, 8
+; GFX8-NEXT: v_mad_i32_i24 v2, v4, v5, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -1644,19 +1641,19 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 24, v2
; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8
+; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_ashrrev_i32_e32 v4, 24, v0
; GFX7-NEXT: v_bfe_i32 v5, v0, 0, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v1, v1, v4, s0
-; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
@@ -1672,24 +1669,24 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_ashrrev_i32_e32 v1, 24, v3
-; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 8
-; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 24, v0
-; GFX8-NEXT: v_bfe_i32 v5, v0, 0, 8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 24, v1
+; GFX8-NEXT: v_bfe_i32 v4, v0, 0, 8
+; GFX8-NEXT: v_bfe_i32 v5, v1, 0, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
+; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, s0
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
-; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
-; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT: v_bfe_i32 v1, v1, 16, 8
+; GFX8-NEXT: v_mad_i32_i24 v2, v4, v5, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -1859,25 +1856,25 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 8
-; GFX8-NEXT: v_bfe_i32 v7, v3, 16, 8
+; GFX8-NEXT: v_bfe_i32 v3, v4, 0, 8
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v5, v2, 0, 8
-; GFX8-NEXT: v_mul_i32_i24_sdwa v6, sext(v3), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-NEXT: v_mul_i32_i24_sdwa v6, sext(v4), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-NEXT: v_bfe_i32 v7, v4, 16, 8
; GFX8-NEXT: v_bfe_i32 v8, v2, 16, 8
-; GFX8-NEXT: v_mad_i32_i24 v4, v4, v5, v6
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 24, v3
+; GFX8-NEXT: v_mad_i32_i24 v3, v3, v5, v6
+; GFX8-NEXT: v_ashrrev_i32_e32 v4, 24, v4
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 24, v2
-; GFX8-NEXT: v_mad_i32_i24 v4, v7, v8, v4
-; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v4
+; GFX8-NEXT: v_mad_i32_i24 v3, v7, v8, v3
+; GFX8-NEXT: v_mad_i32_i24 v2, v4, v2, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2001,21 +1998,21 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3]
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b32 s10, -1
; GFX7-NEXT: s_mov_b32 s8, s6
; GFX7-NEXT: s_mov_b32 s9, s7
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_i32 v3, v3, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v1, v1, v1, s0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_i32 v3, v3, 8, 8
; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v6, v0, 16, 8
@@ -2030,36 +2027,36 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_3src:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
-; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8
+; GFX8-NEXT: v_bfe_i32 v1, v5, 0, 8
+; GFX8-NEXT: v_bfe_i32 v3, v5, 8, 8
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_bfe_i32 v2, v2, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v1, v1, v1, s0
-; GFX8-NEXT: v_bfe_i32 v5, v3, 16, 8
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 24, v3
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v4, v4, 8, 8
-; GFX8-NEXT: v_mad_i32_i24 v1, v2, v4, v1
+; GFX8-NEXT: v_bfe_i32 v4, v5, 16, 8
+; GFX8-NEXT: v_mad_i32_i24 v1, v3, v2, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v5, 24, v5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v6, v0, 16, 8
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0
-; GFX8-NEXT: v_mad_i32_i24 v1, v5, v6, v1
-; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT: v_mad_i32_i24 v1, v4, v6, v1
+; GFX8-NEXT: v_mad_i32_i24 v2, v5, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -2210,21 +2207,21 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3]
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b32 s10, -1
; GFX7-NEXT: s_mov_b32 s8, s6
; GFX7-NEXT: s_mov_b32 s9, s7
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_i32 v3, v3, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v1, v1, v1, s0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_i32 v3, v3, 8, 8
; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8
@@ -2236,33 +2233,33 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_3src_3ele:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
-; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8
+; GFX8-NEXT: v_bfe_i32 v1, v5, 0, 8
+; GFX8-NEXT: v_bfe_i32 v3, v5, 8, 8
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_bfe_i32 v2, v2, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v1, v1, v1, s0
-; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v4, v4, 8, 8
-; GFX8-NEXT: v_mad_i32_i24 v1, v2, v4, v1
+; GFX8-NEXT: v_bfe_i32 v4, v5, 16, 8
+; GFX8-NEXT: v_mad_i32_i24 v1, v3, v2, v1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
-; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT: v_mad_i32_i24 v2, v4, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -2402,6 +2399,7 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1,
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dword s12, s[4:5], 0xf
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
@@ -2410,21 +2408,19 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11
; GFX7-NEXT: s_sext_i32_i16 s1, s12
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8
+; GFX7-NEXT: v_mad_i32_i24 v1, v3, s1, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8
-; GFX7-NEXT: v_mad_i32_i24 v1, v3, s1, v1
; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_i32_i24 v1, v4, v5, v1
@@ -2627,20 +2623,20 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
+; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v4, v0, 0, 8
; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v1, v1, v4, s0
; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8
-; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -2655,24 +2651,24 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
-; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
-; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
-; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8
+; GFX8-NEXT: v_bfe_i32 v4, v0, 8, 8
+; GFX8-NEXT: v_bfe_i32 v5, v1, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
+; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, s0
+; GFX8-NEXT: v_bfe_i32 v1, v1, 16, 8
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
-; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
-; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT: v_mad_i32_i24 v2, v4, v5, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -2817,11 +2813,11 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3]
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b32 s10, -1
; GFX7-NEXT: s_mov_b32 s8, s6
; GFX7-NEXT: s_mov_b32 s9, s7
@@ -2842,32 +2838,32 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_3src_3ele_src0:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8
+; GFX8-NEXT: v_bfe_i32 v3, v5, 8, 8
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v4, 8, 8
-; GFX8-NEXT: v_bfe_i32 v3, v4, 16, 8
+; GFX8-NEXT: v_bfe_i32 v1, v2, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v4, v1, v1, s0
-; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v4
+; GFX8-NEXT: v_bfe_i32 v2, v2, 16, 8
+; GFX8-NEXT: v_mad_i32_i24 v1, v3, v1, v4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
-; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT: v_mad_i32_i24 v2, v2, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -3008,32 +3004,31 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_4src:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s18, 0
-; GFX7-NEXT: s_mov_b32 s19, s3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[16:17], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[16:19], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[16:17], s[10:11]
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[16:19], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[16:17], s[12:13]
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[16:17], s[14:15]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
+; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[4:5], s[12:13]
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[4:5], s[14:15]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mad_i32_i24 v1, v1, v2, s8
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v5, v3, 0, 8
; GFX7-NEXT: v_bfe_i32 v3, v3, 8, 8
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v1, v1, v2, s4
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v2, v4, 0, 8
; GFX7-NEXT: v_bfe_i32 v4, v4, 8, 8
@@ -3049,42 +3044,42 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_4src:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v6
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s11
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s10, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s13
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s12, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s11
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s10, v6
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: v_mov_b32_e32 v5, s13
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s12, v6
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
+; GFX8-NEXT: flat_load_dword v3, v[4:5]
; GFX8-NEXT: v_mov_b32_e32 v1, s15
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v6
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(3)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
-; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_bfe_i32 v1, v7, 0, 8
+; GFX8-NEXT: v_bfe_i32 v4, v7, 8, 8
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_bfe_i32 v3, v4, 0, 8
-; GFX8-NEXT: v_bfe_i32 v4, v4, 8, 8
-; GFX8-NEXT: v_mad_i32_i24 v1, v3, v4, v1
+; GFX8-NEXT: v_bfe_i32 v5, v2, 0, 8
+; GFX8-NEXT: v_bfe_i32 v2, v2, 8, 8
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v4, s2
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v6, v5, 0, 8
-; GFX8-NEXT: v_bfe_i32 v5, v5, 8, 8
-; GFX8-NEXT: v_mad_i32_i24 v1, v6, v5, v1
+; GFX8-NEXT: v_bfe_i32 v6, v3, 0, 8
+; GFX8-NEXT: v_bfe_i32 v3, v3, 8, 8
+; GFX8-NEXT: v_mad_i32_i24 v1, v5, v2, v1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v7, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v0, v0, 8, 8
+; GFX8-NEXT: v_mad_i32_i24 v1, v6, v3, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v7, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -3297,34 +3292,34 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xff
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2
-; GFX8-NEXT: v_mul_lo_u16_sdwa v6, sext(v3), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_mul_lo_u16_sdwa v6, sext(v4), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 8
; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT: v_and_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
+; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX8-NEXT: v_mad_u16 v6, v8, v7, v6
-; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX8-NEXT: v_mad_u16 v4, v4, v5, v6
+; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8
+; GFX8-NEXT: v_mad_u16 v3, v3, v5, v6
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
-; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
+; GFX8-NEXT: v_mad_u16 v2, v4, v2, v3
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 82d62910bcb00..84f9c6c7ea5c7 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -21,19 +21,19 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
+; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, s4
-; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, s8
; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
@@ -52,27 +52,27 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
-; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
-; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX8-NEXT: v_bfe_u32 v4, v0, 8, 8
+; GFX8-NEXT: v_bfe_u32 v5, v1, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
-; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
-; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, s0
+; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8
+; GFX8-NEXT: v_bfe_u32 v7, v1, 16, 8
+; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v6, v7, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -84,15 +84,15 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
@@ -105,12 +105,12 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -122,13 +122,13 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
@@ -142,11 +142,11 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -195,36 +195,35 @@ entry:
define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
; GFX7-LABEL: udot4_acc16:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b32 s10, 0
-; GFX7-NEXT: s_mov_b32 s11, s7
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b32 s6, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[4:7], 0
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0
+; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
-; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v6, v3
; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot4_acc16:
@@ -237,31 +236,31 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: flat_load_ushort v4, v[0:1]
+; GFX8-NEXT: flat_load_ushort v3, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3
-; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v4
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u16 v4, v6, v7, v4
+; GFX8-NEXT: v_mad_u16 v3, v6, v7, v3
+; GFX8-NEXT: v_and_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mad_u16 v4, v8, v9, v4
+; GFX8-NEXT: v_mad_u16 v3, v8, v9, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
-; GFX8-NEXT: v_mad_u16 v4, v10, v5, v4
-; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
+; GFX8-NEXT: v_mad_u16 v3, v10, v5, v3
+; GFX8-NEXT: v_mad_u16 v2, v4, v2, v3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -303,11 +302,11 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
; GFX9-DL-NEXT: global_load_ushort v4, v1, s[6:7]
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4
+; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4
; GFX9-DL-NEXT: global_store_short v1, v0, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -319,12 +318,12 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
+; GFX10-DL-NEXT: s_clause 0x2
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
; GFX10-DL-NEXT: global_load_ushort v4, v1, s[6:7]
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4
; GFX10-DL-NEXT: global_store_short v1, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
@@ -337,12 +336,12 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-NEXT: s_clause 0x2
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-DL-NEXT: global_load_u16 v3, v1, s[4:5]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3
; GFX11-DL-NEXT: global_store_b16 v1, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -391,36 +390,35 @@ entry:
define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
; GFX7-LABEL: udot4_acc8:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b32 s10, 0
-; GFX7-NEXT: s_mov_b32 s11, s7
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b32 s6, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
+; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
-; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v6, v3
; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot4_acc8:
@@ -432,25 +430,25 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
+; GFX8-NEXT: flat_load_ubyte v3, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
+; GFX8-NEXT: v_mad_u16 v2, v4, v2, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_mad_u16 v2, v7, v8, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4
; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2
; GFX8-NEXT: v_mad_u16 v2, v9, v10, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
@@ -489,11 +487,11 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[6:7]
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4
+; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4
; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -505,12 +503,12 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
+; GFX10-DL-NEXT: s_clause 0x2
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7]
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4
; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
@@ -523,12 +521,12 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-NEXT: s_clause 0x2
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3
; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -578,19 +576,18 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v3, off, s[4:7], 0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b32 s6, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, v3
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -604,22 +601,22 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
+; GFX8-NEXT: flat_load_ubyte v2, v[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
-; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2
-; GFX8-NEXT: flat_store_byte v[0:1], v2
+; GFX8-NEXT: v_mad_u16 v0, v0, v1, v2
+; GFX8-NEXT: v_mad_u16 v0, v3, v6, v0
+; GFX8-NEXT: flat_store_byte v[4:5], v0
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot2_8:
@@ -647,19 +644,19 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[6:7]
; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0100
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0
+; GFX9-DL-NEXT: v_perm_b32 v0, v2, v2, s0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s0
+; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3
-; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7]
+; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v4
+; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot2_8:
@@ -668,19 +665,19 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: s_clause 0x2
; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
+; GFX10-DL-NEXT: v_perm_b32 v0, v2, v2, 0xc0c0100
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0100
+; GFX10-DL-NEXT: v_perm_b32 v2, v3, v3, 0xc0c0100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3
-; GFX10-DL-NEXT: global_store_byte v0, v1, s[6:7]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v4
+; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot2_8:
@@ -688,23 +685,22 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_u8 v3, v2, s[4:5]
+; GFX11-DL-NEXT: s_clause 0x2
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5]
; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
+; GFX11-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v3
-; GFX11-DL-NEXT: global_store_b8 v2, v0, s[4:5]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3
+; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -733,36 +729,35 @@ entry:
define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX7-LABEL: udot4_CommutationInsideMAD:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b32 s10, 0
-; GFX7-NEXT: s_mov_b32 s11, s7
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b32 s6, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
+; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1
-; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v1, v3
; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v8, v5, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
-; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot4_CommutationInsideMAD:
@@ -774,25 +769,25 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
+; GFX8-NEXT: flat_load_ubyte v3, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u16 v2, v2, v3, v4
+; GFX8-NEXT: v_mad_u16 v2, v2, v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_mad_u16 v2, v8, v7, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4
; GFX8-NEXT: v_mad_u16 v2, v6, v5, v2
; GFX8-NEXT: v_mad_u16 v2, v10, v9, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
@@ -831,11 +826,11 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[6:7]
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4
+; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4
; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -847,12 +842,12 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
+; GFX10-DL-NEXT: s_clause 0x2
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7]
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4
; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
@@ -865,12 +860,12 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-NEXT: s_clause 0x2
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3
; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -911,36 +906,35 @@ entry:
define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
; GFX7-LABEL: udot4_CommutationAccrossMADs:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b32 s10, 0
-; GFX7-NEXT: s_mov_b32 s11, s7
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b32 s6, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
+; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1
-; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
+; GFX7-NEXT: v_mad_u32_u24 v3, v7, v4, v3
; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8
-; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v1, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v8, v5, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
-; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot4_CommutationAccrossMADs:
@@ -952,25 +946,25 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
+; GFX8-NEXT: flat_load_ubyte v3, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u16 v4, v8, v7, v4
-; GFX8-NEXT: v_mad_u16 v2, v2, v3, v4
+; GFX8-NEXT: v_mad_u16 v3, v8, v7, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2
+; GFX8-NEXT: v_mad_u16 v2, v2, v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4
; GFX8-NEXT: v_mad_u16 v2, v6, v5, v2
; GFX8-NEXT: v_mad_u16 v2, v10, v9, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
@@ -1009,11 +1003,11 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[6:7]
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4
+; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4
; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -1025,12 +1019,12 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
+; GFX10-DL-NEXT: s_clause 0x2
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7]
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4
; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
@@ -1043,12 +1037,12 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-NEXT: s_clause 0x2
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3
; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -1099,20 +1093,20 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
+; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v8, v1, v5, s4
+; GFX7-NEXT: v_mad_u32_u24 v8, v1, v5, s8
; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, v8
-; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
@@ -1131,28 +1125,28 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
-; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
-; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX8-NEXT: v_bfe_u32 v4, v0, 8, 8
+; GFX8-NEXT: v_bfe_u32 v5, v1, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s0
+; GFX8-NEXT: v_mad_u32_u24 v8, v2, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v8
-; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4
+; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8
+; GFX8-NEXT: v_bfe_u32 v7, v1, 16, 8
+; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v6, v7, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -1306,25 +1300,25 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, s4
-; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
+; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, s8
; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, s4, v3
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, s8, v3
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -1339,28 +1333,28 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT: v_bfe_u32 v4, v0, 8, 8
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
+; GFX8-NEXT: v_bfe_u32 v5, v1, 8, 8
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s0
-; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4
+; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8
+; GFX8-NEXT: v_bfe_u32 v7, v1, 16, 8
+; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v6, v7, v2
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v4
-; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v1
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, v2
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v5
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1373,17 +1367,17 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT: v_bfe_u32 v4, v1, 8, 8
+; GFX9-NODL-NEXT: v_bfe_u32 v5, v1, 8, 8
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_bfe_u32 v5, v2, 8, 8
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT: v_bfe_u32 v4, v2, 8, 8
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, s0
; GFX9-NODL-NEXT: v_add_u32_e32 v4, s0, v2
@@ -1398,15 +1392,15 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_add_i32 s1, s0, s0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0
; GFX9-DL-NEXT: v_add3_u32 v1, s1, v3, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
@@ -1419,14 +1413,14 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0
; GFX10-DL-NEXT: s_add_i32 s0, s0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_add3_u32 v0, s0, v0, v1
@@ -1514,24 +1508,23 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8
+; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8
; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
+; GFX7-NEXT: v_mad_u32_u24 v3, v4, v7, v3
; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v6, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
@@ -1549,31 +1542,31 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: flat_load_ushort v4, v[0:1]
+; GFX8-NEXT: flat_load_ushort v3, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3
-; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX8-NEXT: v_bfe_i32 v6, v3, 0, 8
-; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v4
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX8-NEXT: v_bfe_i32 v6, v4, 0, 8
; GFX8-NEXT: v_bfe_i32 v7, v2, 0, 8
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u16 v4, v8, v9, v4
+; GFX8-NEXT: v_mad_u16 v3, v8, v9, v3
+; GFX8-NEXT: v_and_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mad_u16 v4, v6, v7, v4
+; GFX8-NEXT: v_mad_u16 v3, v6, v7, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
-; GFX8-NEXT: v_mad_u16 v4, v10, v5, v4
-; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
+; GFX8-NEXT: v_mad_u16 v3, v10, v5, v3
+; GFX8-NEXT: v_mad_u16 v2, v4, v2, v3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1674,7 +1667,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-TRUE16-NEXT: s_clause 0x1
+; GFX11-DL-TRUE16-NEXT: s_clause 0x2
; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[0:1]
; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3]
; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v5, s[4:5]
@@ -1793,26 +1786,25 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8
+; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v7, v0, 8, 8
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
+; GFX7-NEXT: v_mad_u32_u24 v3, v4, v7, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v6, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
@@ -1970,7 +1962,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-TRUE16-NEXT: s_clause 0x1
+; GFX11-DL-TRUE16-NEXT: s_clause 0x2
; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[0:1]
; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v4, s[4:5]
@@ -2098,9 +2090,9 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2
@@ -2113,7 +2105,7 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s8
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v1, v5, v0
@@ -2129,27 +2121,27 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3
-; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 8
-; GFX8-NEXT: v_lshrrev_b16_e32 v5, 8, v3
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0
-; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8
-; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v1
+; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 8
+; GFX8-NEXT: v_lshrrev_b16_e32 v5, 8, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 8
+; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v5, v7, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v4, v6, v0
-; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0
+; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -2161,15 +2153,15 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
@@ -2182,12 +2174,12 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -2199,13 +2191,13 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
@@ -2219,11 +2211,11 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -2267,29 +2259,28 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0
; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0
-; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; GFX7-NEXT: v_bfe_u32 v2, v0, 8, 8
; GFX7-NEXT: v_alignbit_b32 v0, v6, v0, 16
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX7-NEXT: v_mad_u32_u24 v3, v5, v7, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v4, v3, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_mad_u32_u24 v2, v4, v2, v3
+; GFX7-NEXT: v_mad_u32_u24 v0, v1, v0, v2
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v5, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -2449,7 +2440,7 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-TRUE16-NEXT: s_clause 0x1
+; GFX11-DL-TRUE16-NEXT: s_clause 0x2
; GFX11-DL-TRUE16-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v3, s[4:5]
@@ -2554,36 +2545,35 @@ entry:
define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-LABEL: udot4_acc8_vecMul:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b32 s10, 0
-; GFX7-NEXT: s_mov_b32 s11, s7
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b32 s6, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
+; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v8, v0, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2
+; GFX7-NEXT: v_mad_u32_u24 v3, v4, v7, v3
; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
-; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX7-NEXT: v_mad_u32_u24 v3, v5, v8, v3
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3
+; GFX7-NEXT: v_mad_u32_u24 v0, v1, v6, v0
+; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot4_acc8_vecMul:
@@ -2595,27 +2585,27 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
+; GFX8-NEXT: flat_load_ubyte v3, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX8-NEXT: v_mul_lo_u16_e32 v9, v5, v6
; GFX8-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7
; GFX8-NEXT: v_or_b32_e32 v8, v8, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v8
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
+; GFX8-NEXT: v_mad_u16 v2, v4, v2, v3
; GFX8-NEXT: v_add_u16_e32 v2, v2, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v7
; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2
@@ -2724,7 +2714,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-TRUE16-NEXT: s_clause 0x1
+; GFX11-DL-TRUE16-NEXT: s_clause 0x2
; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[0:1]
; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3]
; GFX11-DL-TRUE16-NEXT: global_load_d16_u8 v0, v5, s[4:5]
@@ -2768,7 +2758,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-DL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-FAKE16-NEXT: s_clause 0x1
+; GFX11-DL-FAKE16-NEXT: s_clause 0x2
; GFX11-DL-FAKE16-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-DL-FAKE16-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-DL-FAKE16-NEXT: global_load_u8 v3, v2, s[4:5]
@@ -2843,9 +2833,9 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
@@ -2868,21 +2858,21 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
-; GFX8-NEXT: v_bfe_u32 v3, v3, 8, 8
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8
+; GFX8-NEXT: v_bfe_u32 v1, v1, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
-; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, s0
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -2894,13 +2884,13 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -3011,19 +3001,19 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
+; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, s0
-; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
@@ -3039,24 +3029,24 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
-; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
-; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
-; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX8-NEXT: v_bfe_u32 v4, v0, 8, 8
+; GFX8-NEXT: v_bfe_u32 v5, v1, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
+; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, s0
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
-; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT: v_bfe_u32 v1, v1, 16, 8
+; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -3196,19 +3186,19 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
+; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, s0
-; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
@@ -3224,24 +3214,24 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v3
-; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v1
+; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v0
+; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
+; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, s0
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
-; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT: v_bfe_u32 v1, v1, 16, 8
+; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -3411,25 +3401,25 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v3
-; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 8
+; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v2
-; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 8
; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 8
-; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT: v_mad_u32_u24 v3, v3, v5, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
-; GFX8-NEXT: v_mad_u32_u24 v4, v7, v8, v4
-; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v4
+; GFX8-NEXT: v_mad_u32_u24 v3, v7, v8, v3
+; GFX8-NEXT: v_mad_u32_u24 v2, v4, v2, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -3552,21 +3542,21 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3]
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b32 s10, -1
; GFX7-NEXT: s_mov_b32 s8, s6
; GFX7-NEXT: s_mov_b32 s9, s7
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v1, v1, s0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8
; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_u32 v6, v0, 16, 8
@@ -3581,36 +3571,36 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc32_3src:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
-; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
+; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v5
+; GFX8-NEXT: v_bfe_u32 v3, v5, 8, 8
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_bfe_u32 v2, v2, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s0
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 8
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_u32 v4, v4, 8, 8
-; GFX8-NEXT: v_mad_u32_u24 v1, v2, v4, v1
+; GFX8-NEXT: v_bfe_u32 v4, v5, 16, 8
+; GFX8-NEXT: v_mad_u32_u24 v1, v3, v2, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; GFX8-NEXT: v_mad_u32_u24 v1, v5, v6, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT: v_mad_u32_u24 v1, v4, v6, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v5, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -3761,21 +3751,21 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3]
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b32 s10, -1
; GFX7-NEXT: s_mov_b32 s8, s6
; GFX7-NEXT: s_mov_b32 s9, s7
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v1, v1, s0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8
; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
@@ -3787,33 +3777,33 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc32_3src_3ele:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
-; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
+; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v5
+; GFX8-NEXT: v_bfe_u32 v3, v5, 8, 8
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_bfe_u32 v2, v2, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s0
-; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_u32 v4, v4, 8, 8
-; GFX8-NEXT: v_mad_u32_u24 v1, v2, v4, v1
+; GFX8-NEXT: v_bfe_u32 v4, v5, 16, 8
+; GFX8-NEXT: v_mad_u32_u24 v1, v3, v2, v1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
-; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v4, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -3954,6 +3944,7 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1,
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dword s12, s[4:5], 0xf
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
@@ -3962,21 +3953,19 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11
; GFX7-NEXT: s_and_b32 s1, s12, 0xffff
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
+; GFX7-NEXT: v_mad_u32_u24 v1, v3, s1, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, s1, v1
; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v5, v1
@@ -4179,20 +4168,20 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
+; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, s0
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -4207,24 +4196,24 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
-; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
-; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
-; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX8-NEXT: v_bfe_u32 v4, v0, 8, 8
+; GFX8-NEXT: v_bfe_u32 v5, v1, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
+; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, s0
+; GFX8-NEXT: v_bfe_u32 v1, v1, 16, 8
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
-; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -4368,11 +4357,11 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3]
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b32 s10, -1
; GFX7-NEXT: s_mov_b32 s8, s6
; GFX7-NEXT: s_mov_b32 s9, s7
@@ -4393,32 +4382,32 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc32_3src_3ele_src0:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
+; GFX8-NEXT: v_bfe_u32 v3, v5, 8, 8
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_u32 v1, v4, 8, 8
-; GFX8-NEXT: v_bfe_u32 v3, v4, 16, 8
+; GFX8-NEXT: v_bfe_u32 v1, v2, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v4, v1, v1, s0
-; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v4
+; GFX8-NEXT: v_bfe_u32 v2, v2, 16, 8
+; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, v4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
-; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -4558,32 +4547,31 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX7-LABEL: udot4_4src:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s18, 0
-; GFX7-NEXT: s_mov_b32 s19, s3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[16:17], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[16:19], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[16:17], s[10:11]
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[16:19], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[16:17], s[12:13]
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[16:17], s[14:15]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
+; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[4:5], s[12:13]
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[4:5], s[14:15]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v2, s8
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v3
; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v1, v2, s4
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v4
; GFX7-NEXT: v_bfe_u32 v4, v4, 8, 8
@@ -4599,42 +4587,42 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_4src:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s11
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s10, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s13
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s12, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v6
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s11
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s10, v6
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: v_mov_b32_e32 v5, s13
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s12, v6
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
+; GFX8-NEXT: flat_load_dword v3, v[4:5]
; GFX8-NEXT: v_mov_b32_e32 v1, s15
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v6
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(3)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
-; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v7
+; GFX8-NEXT: v_bfe_u32 v4, v7, 8, 8
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX8-NEXT: v_bfe_u32 v4, v4, 8, 8
-; GFX8-NEXT: v_mad_u32_u24 v1, v3, v4, v1
+; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v2
+; GFX8-NEXT: v_bfe_u32 v2, v2, 8, 8
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v4, s2
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v5
-; GFX8-NEXT: v_bfe_u32 v5, v5, 8, 8
-; GFX8-NEXT: v_mad_u32_u24 v1, v6, v5, v1
+; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v3
+; GFX8-NEXT: v_bfe_u32 v3, v3, 8, 8
+; GFX8-NEXT: v_mad_u32_u24 v1, v5, v2, v1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8
+; GFX8-NEXT: v_mad_u32_u24 v1, v6, v3, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v7, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -4896,8 +4884,8 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-NODL-NEXT: global_load_dword v3, v2, s[2:3]
+; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
@@ -5109,24 +5097,24 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
+; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v2
-; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 8
-; GFX8-NEXT: v_mad_u32_u24 v3, v6, v3, v7
+; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v3
+; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 8
+; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 8
+; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v6
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
-; GFX8-NEXT: v_mad_u32_u24 v3, v8, v5, v3
-; GFX8-NEXT: v_mad_u32_u24 v2, v2, v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT: v_mad_u32_u24 v4, v7, v8, v4
+; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -5471,24 +5459,24 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v2
-; GFX8-NEXT: v_bfe_u32 v7, v2, 8, 8
+; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 8
-; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v3
-; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
-; GFX8-NEXT: v_bfe_u32 v3, v3, 8, 8
-; GFX8-NEXT: v_mad_u32_u24 v4, v7, v8, v4
-; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4
+; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 8
+; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX8-NEXT: v_bfe_u32 v7, v4, 8, 8
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v2
+; GFX8-NEXT: v_mad_u32_u24 v3, v3, v5, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
+; GFX8-NEXT: v_bfe_u32 v2, v2, 8, 8
+; GFX8-NEXT: v_mad_u32_u24 v3, v7, v8, v3
+; GFX8-NEXT: v_mad_u32_u24 v2, v4, v2, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -5895,6 +5883,7 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2 killed $sgpr3
; GFX10-DL-NEXT: ; kill: killed $vgpr5
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX10-DL-NEXT: global_load_dword v0, v5, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -5915,6 +5904,7 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b128 v[0:3], v1, s[0:1]
; GFX11-DL-NEXT: global_load_b32 v0, v4, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
@@ -6011,29 +6001,29 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s0, v1
-; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_movk_i32 s0, 0xfc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v3
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2
-; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
-; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v2
-; GFX8-NEXT: v_mad_u32_u24 v3, v6, v3, v7
+; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v3
+; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
+; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v2
+; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 8
+; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v6
; GFX8-NEXT: v_bfe_u32 v2, v2, 8, 8
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
-; GFX8-NEXT: v_mad_u32_u24 v3, v8, v5, v3
-; GFX8-NEXT: v_mad_u32_u24 v2, v2, v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT: v_mad_u32_u24 v4, v7, v8, v4
+; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -6087,6 +6077,7 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v2, v1, s[2:3]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1] offset:252
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -6107,6 +6098,7 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 3, v0
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v1, s[2:3]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] offset:252
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
@@ -6171,17 +6163,17 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v1, v1, s0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -6196,11 +6188,11 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
@@ -6239,17 +6231,17 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0500
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0100
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v2, v2, v1, s1
-; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2
+; GFX9-DL-NEXT: v_perm_b32 v1, v1, v2, s1
+; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -6261,13 +6253,13 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0500
-; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
+; GFX10-DL-NEXT: v_perm_b32 v0, v1, v2, 0xc0c0500
+; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0100
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
@@ -6284,15 +6276,15 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0500
-; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
+; GFX11-DL-NEXT: v_perm_b32 v1, v1, v0, 0xc0c0500
+; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index b9d3763e7def1..4201f9c238970 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -26,20 +26,20 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4
; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4
+; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4
; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, s4
-; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, s8
; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1
; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4
@@ -72,43 +72,43 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s15, 0xe80000
; GFX8-NEXT: s_add_u32 s12, s12, s11
; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4
-; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4
-; GFX8-NEXT: v_bfe_i32 v6, v3, 8, 4
-; GFX8-NEXT: v_bfe_i32 v8, v3, 12, 4
-; GFX8-NEXT: v_bfe_i32 v10, v3, 16, 4
-; GFX8-NEXT: v_bfe_i32 v12, v3, 20, 4
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4
-; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 4
+; GFX8-NEXT: v_bfe_i32 v4, v0, 4, 4
+; GFX8-NEXT: v_bfe_i32 v5, v1, 4, 4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
-; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
-; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1
-; GFX8-NEXT: v_bfe_i32 v11, v0, 16, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v8, v9, v1
-; GFX8-NEXT: v_bfe_i32 v13, v0, 20, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v10, v11, v1
-; GFX8-NEXT: v_bfe_i32 v14, v3, 24, 4
-; GFX8-NEXT: v_bfe_i32 v15, v0, 24, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v12, v13, v1
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 28, v3
+; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, s0
+; GFX8-NEXT: v_bfe_i32 v6, v0, 8, 4
+; GFX8-NEXT: v_bfe_i32 v7, v1, 8, 4
+; GFX8-NEXT: v_mad_i32_i24 v2, v4, v5, v2
+; GFX8-NEXT: v_bfe_i32 v8, v0, 12, 4
+; GFX8-NEXT: v_bfe_i32 v9, v1, 12, 4
+; GFX8-NEXT: v_mad_i32_i24 v2, v6, v7, v2
+; GFX8-NEXT: v_bfe_i32 v10, v0, 16, 4
+; GFX8-NEXT: v_bfe_i32 v11, v1, 16, 4
+; GFX8-NEXT: v_mad_i32_i24 v2, v8, v9, v2
+; GFX8-NEXT: v_bfe_i32 v12, v0, 20, 4
+; GFX8-NEXT: v_bfe_i32 v13, v1, 20, 4
+; GFX8-NEXT: v_mad_i32_i24 v2, v10, v11, v2
+; GFX8-NEXT: v_bfe_i32 v14, v0, 24, 4
+; GFX8-NEXT: v_bfe_i32 v15, v1, 24, 4
+; GFX8-NEXT: v_mad_i32_i24 v2, v12, v13, v2
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 28, v0
-; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1
-; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v1, 28, v1
+; GFX8-NEXT: v_mad_i32_i24 v2, v14, v15, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -172,8 +172,8 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s14, -1
; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
@@ -181,7 +181,7 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v2, v1, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -199,13 +199,13 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
-; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-XNACK-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
+; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v2, v1, s0
; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
@@ -223,11 +223,11 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
-; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[2:3]
+; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s0
+; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v0, v1, s0
; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -320,26 +320,25 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4
+; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4
; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v10, v3
; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
@@ -690,7 +689,7 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
+; GFX10-DL-NOXNACK-NEXT: s_clause 0x2
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[10:11]
; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1]
@@ -842,26 +841,25 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4
+; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4
; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4
-; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10
; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4
; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v10, v3
; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
@@ -1212,7 +1210,7 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
+; GFX10-DL-NOXNACK-NEXT: s_clause 0x2
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[10:11]
; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v3, v2, s[0:1]
@@ -1366,21 +1364,21 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4
; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4
+; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v16, v1, v9, s4
+; GFX7-NEXT: v_mad_i32_i24 v16, v1, v9, s8
; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4
; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, v16
-; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1
; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4
@@ -1414,44 +1412,44 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s15, 0xe80000
; GFX8-NEXT: s_add_u32 s12, s12, s11
; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4
-; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4
-; GFX8-NEXT: v_bfe_i32 v6, v3, 8, 4
-; GFX8-NEXT: v_bfe_i32 v8, v3, 12, 4
-; GFX8-NEXT: v_bfe_i32 v10, v3, 16, 4
-; GFX8-NEXT: v_bfe_i32 v12, v3, 20, 4
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v16, v1, v2, s0
-; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v16
-; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
-; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1
-; GFX8-NEXT: v_bfe_i32 v11, v0, 16, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v8, v9, v1
-; GFX8-NEXT: v_bfe_i32 v13, v0, 20, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v10, v11, v1
-; GFX8-NEXT: v_bfe_i32 v14, v3, 24, 4
-; GFX8-NEXT: v_bfe_i32 v15, v0, 24, 4
-; GFX8-NEXT: v_mad_i32_i24 v1, v12, v13, v1
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 28, v3
+; GFX8-NEXT: v_mad_i32_i24 v16, v2, v3, s0
+; GFX8-NEXT: v_bfe_i32 v4, v0, 4, 4
+; GFX8-NEXT: v_bfe_i32 v5, v1, 4, 4
+; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, v16
+; GFX8-NEXT: v_bfe_i32 v6, v0, 8, 4
+; GFX8-NEXT: v_bfe_i32 v7, v1, 8, 4
+; GFX8-NEXT: v_mad_i32_i24 v2, v4, v5, v2
+; GFX8-NEXT: v_bfe_i32 v8, v0, 12, 4
+; GFX8-NEXT: v_bfe_i32 v9, v1, 12, 4
+; GFX8-NEXT: v_mad_i32_i24 v2, v6, v7, v2
+; GFX8-NEXT: v_bfe_i32 v10, v0, 16, 4
+; GFX8-NEXT: v_bfe_i32 v11, v1, 16, 4
+; GFX8-NEXT: v_mad_i32_i24 v2, v8, v9, v2
+; GFX8-NEXT: v_bfe_i32 v12, v0, 20, 4
+; GFX8-NEXT: v_bfe_i32 v13, v1, 20, 4
+; GFX8-NEXT: v_mad_i32_i24 v2, v10, v11, v2
+; GFX8-NEXT: v_bfe_i32 v14, v0, 24, 4
+; GFX8-NEXT: v_bfe_i32 v15, v1, 24, 4
+; GFX8-NEXT: v_mad_i32_i24 v2, v12, v13, v2
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 28, v0
-; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1
-; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v1, 28, v1
+; GFX8-NEXT: v_mad_i32_i24 v2, v14, v15, v2
+; GFX8-NEXT: v_mad_i32_i24 v0, v0, v1, v2
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v16, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1757,9 +1755,9 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
@@ -1781,7 +1779,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_i32 v15, v0, 4, 4
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s4
+; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s8
; GFX7-NEXT: v_mad_i32_i24 v0, v8, v15, v0
; GFX7-NEXT: v_mad_i32_i24 v0, v7, v14, v0
; GFX7-NEXT: v_mad_i32_i24 v0, v6, v13, v0
@@ -1804,11 +1802,11 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s15, 0xe80000
; GFX8-NEXT: s_add_u32 s12, s12, s11
@@ -1903,8 +1901,8 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s14, -1
; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
@@ -1912,7 +1910,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v2, v1, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -1930,13 +1928,13 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
-; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-XNACK-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
+; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v2, v1, s0
; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
@@ -1954,11 +1952,11 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
-; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[2:3]
+; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s0
+; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v0, v1, s0
; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -2015,18 +2013,18 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v6, v2, 0, 4
-; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_i32 v13, v0, 0, 4
+; GFX7-NEXT: v_bfe_i32 v1, v2, 24, 4
; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v13, v0, 0, 4
; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4
; GFX7-NEXT: v_ashrrev_i32_e32 v7, 28, v2
; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4
@@ -2043,11 +2041,10 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
+; GFX7-NEXT: v_mad_u32_u24 v3, v6, v13, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
@@ -2057,12 +2054,12 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -2427,7 +2424,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
+; GFX10-DL-NOXNACK-NEXT: s_clause 0x2
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[10:11]
; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1]
@@ -2560,18 +2557,18 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v7, v2, 0, 4
-; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_i32 v14, v0, 0, 4
+; GFX7-NEXT: v_bfe_i32 v1, v2, 24, 4
; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v14, v0, 0, 4
; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4
; GFX7-NEXT: v_bfe_i32 v6, v2, 8, 4
; GFX7-NEXT: v_ashrrev_i32_e32 v8, 28, v2
@@ -2588,11 +2585,10 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
+; GFX7-NEXT: v_mad_u32_u24 v3, v7, v14, v3
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3
; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v16
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
@@ -2602,12 +2598,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11
; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
-; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -2910,7 +2906,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: s_clause 0x1
+; GFX10-DL-XNACK-NEXT: s_clause 0x2
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[10:11]
; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[0:1]
@@ -3011,7 +3007,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
+; GFX10-DL-NOXNACK-NEXT: s_clause 0x2
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[10:11]
; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 50f0a39802270..00c49cdecefb6 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -24,9 +24,9 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
@@ -48,7 +48,7 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s8
; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
@@ -71,11 +71,11 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s15, 0xe80000
; GFX8-NEXT: s_add_u32 s12, s12, s11
@@ -170,8 +170,8 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s14, -1
; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
@@ -179,7 +179,7 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -197,13 +197,13 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
+; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -296,14 +296,14 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
@@ -311,7 +311,7 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
@@ -320,15 +320,14 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3
; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
@@ -491,7 +490,7 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_add_u32 s12, s12, s11
; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_clause 0x1
+; GFX10-DL-NEXT: s_clause 0x2
; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: global_load_ushort v4, v1, s[6:7]
@@ -614,14 +613,14 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
@@ -629,7 +628,7 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
@@ -638,15 +637,14 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3
; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
@@ -809,7 +807,7 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_add_u32 s12, s12, s11
; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_clause 0x1
+; GFX10-DL-NEXT: s_clause 0x2
; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7]
@@ -932,14 +930,14 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
@@ -947,7 +945,7 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
@@ -956,15 +954,14 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3
; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -1131,7 +1128,7 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_add_u32 s12, s12, s11
; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_clause 0x1
+; GFX10-DL-NEXT: s_clause 0x2
; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7]
@@ -1239,14 +1236,14 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
@@ -1254,7 +1251,7 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
@@ -1263,15 +1260,14 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3
; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -1438,7 +1434,7 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_add_u32 s12, s12, s11
; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_clause 0x1
+; GFX10-DL-NEXT: s_clause 0x2
; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7]
@@ -1545,9 +1541,9 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
@@ -1569,7 +1565,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v16, v2, v0, s4
+; GFX7-NEXT: v_mad_u32_u24 v16, v2, v0, s8
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16
; GFX7-NEXT: v_mad_u32_u24 v2, v8, v15, v16
; GFX7-NEXT: v_mad_u32_u24 v2, v7, v14, v2
@@ -1594,11 +1590,11 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s15, 0xe80000
; GFX8-NEXT: s_add_u32 s12, s12, s11
@@ -1883,9 +1879,9 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
@@ -1907,7 +1903,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s8
; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
@@ -1930,11 +1926,11 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s15, 0xe80000
; GFX8-NEXT: s_add_u32 s12, s12, s11
@@ -2029,8 +2025,8 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s14, -1
; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
@@ -2038,7 +2034,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -2056,13 +2052,13 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
+; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -2120,14 +2116,14 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
@@ -2135,7 +2131,7 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
@@ -2144,15 +2140,14 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3
; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
@@ -2444,14 +2439,14 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
@@ -2459,7 +2454,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
@@ -2468,15 +2463,14 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3
; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
@@ -2697,7 +2691,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_add_u32 s12, s12, s11
; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_clause 0x1
+; GFX10-DL-NEXT: s_clause 0x2
; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[6:7]
@@ -2807,14 +2801,14 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
@@ -2822,7 +2816,7 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
@@ -2831,15 +2825,14 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3
; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -3128,16 +3121,16 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 15, v2
; GFX7-NEXT: v_bfe_u32 v3, v2, 4, 4
+; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v9, 15, v0
-; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 4
; GFX7-NEXT: v_bfe_u32 v5, v2, 12, 4
; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
; GFX7-NEXT: v_bfe_u32 v7, v2, 20, 4
@@ -3151,7 +3144,7 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX7-NEXT: v_bfe_u32 v15, v0, 24, 4
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 28, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v9, v1, s4
+; GFX7-NEXT: v_mad_u32_u24 v1, v9, v1, s8
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v10, v3, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v11, v4, v0
@@ -3171,33 +3164,33 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v1, 15, v3
-; GFX8-NEXT: v_bfe_u32 v4, v3, 4, 4
-; GFX8-NEXT: v_bfe_u32 v6, v3, 8, 4
-; GFX8-NEXT: v_bfe_u32 v8, v3, 12, 4
-; GFX8-NEXT: v_bfe_u32 v10, v3, 16, 4
-; GFX8-NEXT: v_bfe_u32 v12, v3, 20, 4
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 15, v0
-; GFX8-NEXT: v_bfe_u32 v5, v0, 4, 4
-; GFX8-NEXT: v_bfe_u32 v7, v0, 8, 4
-; GFX8-NEXT: v_bfe_u32 v9, v0, 12, 4
-; GFX8-NEXT: v_bfe_u32 v11, v0, 16, 4
-; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4
-; GFX8-NEXT: v_bfe_u32 v14, v3, 24, 4
-; GFX8-NEXT: v_bfe_u32 v15, v0, 24, 4
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 28, v3
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v3, 15, v1
+; GFX8-NEXT: v_bfe_u32 v4, v0, 4, 4
+; GFX8-NEXT: v_bfe_u32 v5, v1, 4, 4
+; GFX8-NEXT: v_bfe_u32 v6, v0, 8, 4
+; GFX8-NEXT: v_bfe_u32 v7, v1, 8, 4
+; GFX8-NEXT: v_bfe_u32 v8, v0, 12, 4
+; GFX8-NEXT: v_bfe_u32 v9, v1, 12, 4
+; GFX8-NEXT: v_bfe_u32 v10, v0, 16, 4
+; GFX8-NEXT: v_bfe_u32 v11, v1, 16, 4
+; GFX8-NEXT: v_bfe_u32 v12, v0, 20, 4
+; GFX8-NEXT: v_bfe_u32 v13, v1, 20, 4
+; GFX8-NEXT: v_bfe_u32 v14, v0, 24, 4
+; GFX8-NEXT: v_bfe_u32 v15, v1, 24, 4
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 28, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, s0
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, s0
+; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, v2
; GFX8-NEXT: v_mad_u32_u24 v0, v5, v4, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v7, v6, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v9, v8, v0
@@ -3259,12 +3252,12 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -3276,13 +3269,13 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0
+; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %v2addr,
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index ec80efc5f0362..4a8967db765c0 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -295,16 +295,16 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s4
; GFX8V4-NEXT: v_mov_b32_e32 v1, s5
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
-; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: v_mov_b32_e32 v2, s10
-; GFX8V4-NEXT: v_mov_b32_e32 v3, s11
; GFX8V4-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
+; GFX8V4-NEXT: v_mov_b32_e32 v3, s11
; GFX8V4-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: s_endpgm
@@ -320,16 +320,16 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
-; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: v_mov_b32_e32 v2, s10
-; GFX8V5-NEXT: v_mov_b32_e32 v3, s11
; GFX8V5-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
+; GFX8V5-NEXT: v_mov_b32_e32 v3, s11
; GFX8V5-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index f8770642cc006..241a0fd4bfdf3 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -55,6 +55,7 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; GFX12-NEXT: s_sext_i32_i16 s13, s13
; GFX12-NEXT: s_add_co_u32 s12, s12, wobble@gotpcrel32@lo+8
; GFX12-NEXT: s_add_co_ci_u32 s13, s13, wobble@gotpcrel32@hi+16
+; GFX12-NEXT: s_clause 0x2
; GFX12-NEXT: s_load_u8 s14, s[4:5], 0x0
; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
; GFX12-NEXT: s_load_b64 s[6:7], s[12:13], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index e0dacb7a59a42..18bd7db458b70 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1593,6 +1593,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[6:7], 0x0
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
index 1c298014e33e7..3b17e84ee3ee1 100644
--- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
@@ -29,17 +29,16 @@ define amdgpu_gfx [13 x i32] @issue130120() {
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_cmp_eq_u32 s46, 0
; CHECK-NEXT: s_mov_b32 s49, s48
+; CHECK-NEXT: s_cselect_b32 s47, s45, 0xf0
; CHECK-NEXT: s_mov_b32 s50, s48
; CHECK-NEXT: s_cselect_b32 s51, 0, s1
-; CHECK-NEXT: s_cselect_b32 s55, 0, s35
+; CHECK-NEXT: s_cselect_b32 vcc_lo, 0, s43
; CHECK-NEXT: v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49
; CHECK-NEXT: s_cselect_b32 s52, 0, s2
-; CHECK-NEXT: s_cselect_b32 s56, 0, s36
-; CHECK-NEXT: s_cselect_b32 vcc_lo, 0, s43
-; CHECK-NEXT: v_mov_b32_e32 v4, s50
-; CHECK-NEXT: s_cselect_b32 s47, s45, 0xf0
; CHECK-NEXT: s_cselect_b32 s53, 0, s3
; CHECK-NEXT: s_cselect_b32 s54, 0, s34
+; CHECK-NEXT: s_cselect_b32 s55, 0, s35
+; CHECK-NEXT: s_cselect_b32 s56, 0, s36
; CHECK-NEXT: s_cselect_b32 s57, 0, s37
; CHECK-NEXT: s_cselect_b32 s58, 0, s38
; CHECK-NEXT: s_cselect_b32 s59, 0, s0
@@ -49,6 +48,9 @@ define amdgpu_gfx [13 x i32] @issue130120() {
; CHECK-NEXT: s_cselect_b32 s63, 0, s42
; CHECK-NEXT: s_cselect_b32 vcc_hi, 0, s44
; CHECK-NEXT: s_mov_b32 s46, s48
+; CHECK-NEXT: v_mov_b32_e32 v4, s50
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: scratch_store_b32 off, v0, s47
; CHECK-NEXT: scratch_store_b32 off, v0, s51
; CHECK-NEXT: scratch_store_b32 off, v0, s52
; CHECK-NEXT: scratch_store_b32 off, v0, s53
@@ -56,7 +58,6 @@ define amdgpu_gfx [13 x i32] @issue130120() {
; CHECK-NEXT: scratch_store_b32 off, v0, s55
; CHECK-NEXT: scratch_store_b64 off, v[0:1], s56
; CHECK-NEXT: scratch_store_b32 off, v0, s57
-; CHECK-NEXT: scratch_store_b32 off, v0, s47
; CHECK-NEXT: scratch_store_b96 off, v[2:4], s58
; CHECK-NEXT: scratch_store_b96 off, v[2:4], s59
; CHECK-NEXT: scratch_store_b32 off, v0, s60
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 04abb75c3f912..8b22b93cb4102 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -126,6 +126,7 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
; CHECK-NEXT: s_add_u32 s18, s18, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s19, s19, use_module@gotpcrel32@hi+12
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0
; CHECK-NEXT: s_load_dword s17, s[8:9], 0x0
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1
@@ -192,6 +193,7 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
; CHECK-NEXT: s_add_u32 s18, s18, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s19, s19, use_module@gotpcrel32@hi+12
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0
; CHECK-NEXT: s_load_dword s17, s[8:9], 0x0
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1
@@ -258,6 +260,7 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
; CHECK-NEXT: s_add_u32 s18, s18, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s19, s19, use_module@gotpcrel32@hi+12
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0
; CHECK-NEXT: s_load_dword s17, s[8:9], 0x0
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1
@@ -324,6 +327,7 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
; CHECK-NEXT: s_add_u32 s18, s18, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s19, s19, use_module@gotpcrel32@hi+12
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0
; CHECK-NEXT: s_load_dword s17, s[8:9], 0x0
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bvh8_intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bvh8_intersect_ray.ll
index ff65d5d96cb2c..46209c58fe27b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bvh8_intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bvh8_intersect_ray.ll
@@ -13,6 +13,7 @@ define amdgpu_ps <10 x float> @image_bvh8_intersect_ray(i64 %node_ptr, float %ra
; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0
; GFX12-SDAG-NEXT: image_bvh8_intersect_ray v[0:9], [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: global_store_b96 v[10:11], v[16:18], off
; GFX12-SDAG-NEXT: global_store_b96 v[12:13], v[19:21], off
; GFX12-SDAG-NEXT: ; return to shader part epilog
@@ -25,6 +26,7 @@ define amdgpu_ps <10 x float> @image_bvh8_intersect_ray(i64 %node_ptr, float %ra
; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0
; GFX12-GISEL-NEXT: image_bvh8_intersect_ray v[0:9], [v[0:1], v[2:3], v[14:16], v[17:19], v9], s[0:3]
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: global_store_b96 v[10:11], v[14:16], off
; GFX12-GISEL-NEXT: global_store_b96 v[12:13], v[17:19], off
; GFX12-GISEL-NEXT: ; return to shader part epilog
@@ -54,6 +56,7 @@ define amdgpu_ps <10 x float> @image_bvh8_intersect_ray_1(i64 %node_ptr, float %
; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 1
; GFX12-SDAG-NEXT: image_bvh8_intersect_ray v[0:9], [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: global_store_b96 v[10:11], v[16:18], off
; GFX12-SDAG-NEXT: global_store_b96 v[12:13], v[19:21], off
; GFX12-SDAG-NEXT: ; return to shader part epilog
@@ -66,6 +69,7 @@ define amdgpu_ps <10 x float> @image_bvh8_intersect_ray_1(i64 %node_ptr, float %
; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 1
; GFX12-GISEL-NEXT: image_bvh8_intersect_ray v[0:9], [v[0:1], v[2:3], v[14:16], v[17:19], v9], s[0:3]
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: global_store_b96 v[10:11], v[14:16], off
; GFX12-GISEL-NEXT: global_store_b96 v[12:13], v[17:19], off
; GFX12-GISEL-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
index 4e61cb4831545..a43c0ee749847 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
@@ -74,6 +74,9 @@ define i32 @dead_i32(i1 %cond, i32 %x, ptr addrspace(1) %ptr1) #0 {
; ASM-GISEL-FAKE16-NEXT: s_wait_alu 0xfffe
; ASM-GISEL-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; ASM-DAG: ; %bb.0: ; %entry
+; ASM-DAG: ; %bb.0: ; %entry
+; ASM-DAG: ; %bb.0: ; %entry
entry:
%dead = call i32 @llvm.amdgcn.dead.i32()
br i1 %cond, label %if.then, label %if.end
@@ -221,6 +224,9 @@ define %trivial_types @dead_struct(i1 %cond, %trivial_types %x, ptr addrspace(1)
; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v12, v13 :: v_dual_mov_b32 v13, v14
; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v14, v15 :: v_dual_mov_b32 v15, v16
; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; ASM-DAG: ; %bb.0: ; %entry
+; ASM-DAG: ; %bb.0: ; %entry
+; ASM-DAG: ; %bb.0: ; %entry
; ASM-GISEL-LABEL: dead_struct:
; ASM-GISEL: ; %bb.0: ; %entry
; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -294,10 +300,10 @@ define [32 x i32] @dead_array(i1 %cond, [32 x i32] %x, ptr addrspace(1) %ptr1, i
; ASM-DAG-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v0
; ASM-DAG-NEXT: v_mov_b32_e32 v0, v1
; ASM-DAG-NEXT: s_clause 0x4
-; ASM-DAG-NEXT: scratch_load_b32 v35, off, s32 offset:12
; ASM-DAG-NEXT: scratch_load_b32 v34, off, s32 offset:8
; ASM-DAG-NEXT: scratch_load_b32 v31, off, s32 offset:4
; ASM-DAG-NEXT: scratch_load_b32 v30, off, s32
+; ASM-DAG-NEXT: scratch_load_b32 v35, off, s32 offset:12
; ASM-DAG-NEXT: scratch_load_b32 v1, off, s32 offset:16
; ASM-DAG-NEXT: s_mov_b32 s0, exec_lo
; ASM-DAG-NEXT: v_and_b32_e32 v33, 1, v33
@@ -515,6 +521,9 @@ define [32 x i32] @dead_array(i1 %cond, [32 x i32] %x, ptr addrspace(1) %ptr1, i
; ASM-GISEL-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; ASM-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0
; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; ASM-DAG: ; %bb.0: ; %entry
+; ASM-DAG: ; %bb.0: ; %entry
+; ASM-DAG: ; %bb.0: ; %entry
; ASM-GISEL-LABEL: dead_array:
; ASM-GISEL: ; %bb.0: ; %entry
; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll
index 7e22d60cd710f..e8ca34fe20677 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll
@@ -16,6 +16,7 @@ define amdgpu_ps <10 x float> @image_bvh_dual_intersect_ray(i64 %node_ptr, float
; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0
; GFX12-SDAG-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[17:19], v[20:22], v[9:10]], s[0:3]
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: global_store_b96 v[11:12], v[17:19], off
; GFX12-SDAG-NEXT: global_store_b96 v[13:14], v[20:22], off
; GFX12-SDAG-NEXT: ; return to shader part epilog
@@ -28,6 +29,7 @@ define amdgpu_ps <10 x float> @image_bvh_dual_intersect_ray(i64 %node_ptr, float
; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0
; GFX12-GISEL-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[15:17], v[18:20], v[9:10]], s[0:3]
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: global_store_b96 v[11:12], v[15:17], off
; GFX12-GISEL-NEXT: global_store_b96 v[13:14], v[18:20], off
; GFX12-GISEL-NEXT: ; return to shader part epilog
@@ -57,6 +59,7 @@ define amdgpu_ps <10 x float> @image_bvh_dual_intersect_ray_1(i64 %node_ptr, flo
; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 1
; GFX12-SDAG-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[17:19], v[20:22], v[9:10]], s[0:3]
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: global_store_b96 v[11:12], v[17:19], off
; GFX12-SDAG-NEXT: global_store_b96 v[13:14], v[20:22], off
; GFX12-SDAG-NEXT: ; return to shader part epilog
@@ -69,6 +72,7 @@ define amdgpu_ps <10 x float> @image_bvh_dual_intersect_ray_1(i64 %node_ptr, flo
; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 1
; GFX12-GISEL-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[15:17], v[18:20], v[9:10]], s[0:3]
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: global_store_b96 v[11:12], v[15:17], off
; GFX12-GISEL-NEXT: global_store_b96 v[13:14], v[18:20], off
; GFX12-GISEL-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index addb395eccf11..ca5a70f18a581 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -14,6 +14,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
+; SDAG-GFX11-TRUE16-NEXT: s_clause 0x1
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -27,6 +28,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
+; SDAG-GFX11-FAKE16-NEXT: s_clause 0x1
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -51,6 +53,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp(
; SDAG-GFX11-TRUE16: ; %bb.0: ; %entry
; SDAG-GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: s_clause 0x2
; SDAG-GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s1
; SDAG-GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s2
; SDAG-GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s3
@@ -66,6 +69,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp(
; SDAG-GFX11-FAKE16: ; %bb.0: ; %entry
; SDAG-GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: s_clause 0x2
; SDAG-GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s2
; SDAG-GFX11-FAKE16-NEXT: scratch_load_u16 v1, off, s3
; SDAG-GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
index 19e03486d122d..9c7fbf49f72a2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -13,6 +13,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
+; SDAG-GFX11-TRUE16-NEXT: s_clause 0x1
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -26,6 +27,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
+; SDAG-GFX11-FAKE16-NEXT: s_clause 0x1
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -39,6 +41,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GISEL-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
+; GISEL-GFX11-TRUE16-NEXT: s_clause 0x1
; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -52,6 +55,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GISEL-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
+; GISEL-GFX11-FAKE16-NEXT: s_clause 0x1
; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -76,6 +80,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp(
; SDAG-GFX11-TRUE16: ; %bb.0: ; %entry
; SDAG-GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: s_clause 0x2
; SDAG-GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s1
; SDAG-GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s2
; SDAG-GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s3
@@ -91,6 +96,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp(
; SDAG-GFX11-FAKE16: ; %bb.0: ; %entry
; SDAG-GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: s_clause 0x2
; SDAG-GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s2
; SDAG-GFX11-FAKE16-NEXT: scratch_load_u16 v1, off, s3
; SDAG-GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s1
@@ -103,6 +109,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp(
; GISEL-GFX11-TRUE16: ; %bb.0: ; %entry
; GISEL-GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GISEL-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT: s_clause 0x2
; GISEL-GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s1
; GISEL-GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s2
; GISEL-GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s3
@@ -118,6 +125,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp(
; GISEL-GFX11-FAKE16: ; %bb.0: ; %entry
; GISEL-GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GISEL-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT: s_clause 0x2
; GISEL-GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s1
; GISEL-GFX11-FAKE16-NEXT: scratch_load_b32 v1, off, s2
; GISEL-GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s3
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index 159592cab6a34..284f3bf585d4e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -11,6 +11,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
@@ -71,6 +72,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll
index 4a735a727229b..7d2cfffde7d14 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll
@@ -49,10 +49,7 @@ define amdgpu_kernel void @mad_f32_imm_b(
; GCN-LABEL: {{^}}mad_f32_imm_c:
; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x41000000
-; GCN: s_load_dword [[B:s[0-9]+]]
-; GCN: s_load_dword [[A:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
-; GCN: v_mac_f32_e32 [[C]], {{s[0-9]+}}, [[VB]]{{$}}
+; GCN: v_mac_f32_e32 [[C]], {{s[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @mad_f32_imm_c(
ptr addrspace(1) %r,
ptr addrspace(1) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 4fa4b73456ecd..44b25d06876be 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -422,6 +422,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s9, 0, s0
; GFX1013-NEXT: v_add_co_u32 v4, s0, s10, v0
; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s11, 0, s0
+; GFX1013-NEXT: s_clause 0x1
; GFX1013-NEXT: flat_load_dword v0, v[2:3]
; GFX1013-NEXT: flat_load_dword v1, v[4:5]
; GFX1013-NEXT: v_mov_b32_e32 v2, 0
@@ -450,6 +451,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX1030-NEXT: v_add_co_u32 v2, s0, s2, v2
; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0
+; GFX1030-NEXT: s_clause 0x1
; GFX1030-NEXT: flat_load_dword v0, v[0:1]
; GFX1030-NEXT: flat_load_dword v1, v[2:3]
; GFX1030-NEXT: v_mov_b32_e32 v2, 0
@@ -475,6 +477,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-NEXT: v_add_co_u32 v2, s0, s2, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: flat_load_b32 v9, v[0:1]
; GFX11-NEXT: flat_load_b32 v10, v[2:3]
; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000
@@ -503,6 +506,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX12-SDAG-NEXT: v_add_co_u32 v2, s0, s2, v2
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: flat_load_b32 v9, v[0:1]
; GFX12-SDAG-NEXT: flat_load_b32 v10, v[2:3]
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x40e00000
@@ -540,6 +544,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: flat_load_b32 v9, v[0:1]
; GFX12-GISEL-NEXT: flat_load_b32 v10, v[2:3]
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
@@ -583,6 +588,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s9, 0, s0
; GFX1013-NEXT: v_add_co_u32 v4, s0, s10, v0
; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s11, 0, s0
+; GFX1013-NEXT: s_clause 0x1
; GFX1013-NEXT: flat_load_dword v0, v[2:3]
; GFX1013-NEXT: flat_load_dword v1, v[4:5]
; GFX1013-NEXT: v_mov_b32_e32 v2, 0
@@ -608,6 +614,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX1030-NEXT: v_add_co_u32 v2, s0, s2, v2
; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0
+; GFX1030-NEXT: s_clause 0x1
; GFX1030-NEXT: flat_load_dword v0, v[0:1]
; GFX1030-NEXT: flat_load_dword v1, v[2:3]
; GFX1030-NEXT: v_mov_b32_e32 v2, 0
@@ -631,6 +638,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-NEXT: v_add_co_u32 v2, s0, s2, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: flat_load_b32 v6, v[0:1]
; GFX11-NEXT: flat_load_b32 v7, v[2:3]
; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400
@@ -656,6 +664,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX12-SDAG-NEXT: v_add_co_u32 v2, s0, s2, v2
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: flat_load_b32 v6, v[0:1]
; GFX12-SDAG-NEXT: flat_load_b32 v7, v[2:3]
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x47004400
@@ -688,6 +697,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: flat_load_b32 v6, v[0:1]
; GFX12-GISEL-NEXT: flat_load_b32 v7, v[2:3]
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 0fe371c1b51fe..130616ed030ef 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -52,23 +52,24 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s12, s14
-; GCN-NEXT: s_load_dwordx2 s[18:19], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s8, s8, 8
+; GCN-NEXT: s_add_u32 s18, s8, 8
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_addc_u32 s19, s9, 0
; GCN-NEXT: s_getpc_b64 s[14:15]
; GCN-NEXT: s_add_u32 s14, s14, function_lds_id@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s15, s15, function_lds_id@gotpcrel32@hi+12
-; GCN-NEXT: s_load_dwordx2 s[20:21], s[14:15], 0x0
+; GCN-NEXT: s_load_dwordx2 s[20:21], s[8:9], 0x0
+; GCN-NEXT: s_load_dwordx2 s[22:23], s[14:15], 0x0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: v_or_b32_e32 v31, v0, v2
; GCN-NEXT: s_mov_b32 s15, 21
+; GCN-NEXT: s_mov_b64 s[8:9], s[18:19]
; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s18
-; GCN-NEXT: v_mov_b32_e32 v1, s19
-; GCN-NEXT: s_swappc_b64 s[30:31], s[20:21]
+; GCN-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NEXT: v_mov_b32_e32 v1, s21
+; GCN-NEXT: s_swappc_b64 s[30:31], s[22:23]
; GCN-NEXT: s_endpgm
call void @function_lds_id(ptr addrspace(1) %out)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index dac54c9f85e96..a861757c901b2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -17,8 +17,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
@@ -114,8 +114,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
@@ -211,8 +211,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
@@ -308,8 +308,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
@@ -405,8 +405,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
@@ -502,8 +502,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
@@ -599,8 +599,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
@@ -696,8 +696,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
@@ -845,8 +845,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
@@ -1294,8 +1294,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
@@ -1442,8 +1442,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
@@ -5332,8 +5332,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
@@ -5429,8 +5429,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
@@ -5526,8 +5526,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
@@ -5670,8 +5670,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
@@ -5767,8 +5767,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
@@ -5966,8 +5966,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword a15, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
index 6e24717a2827d..23781a6585c2c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
@@ -49,6 +49,7 @@ define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX11-NEXT: v_mov_b32_e32 v5, v4
; GFX11-NEXT: buffer_load_u8 v[4:5], off, s[0:3], 0 tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b8 v[0:1], v4, off
; GFX11-NEXT: global_store_b32 v[2:3], v5, off
; GFX11-NEXT: s_endpgm
@@ -60,6 +61,7 @@ define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX12-NEXT: v_mov_b32_e32 v5, v4
; GFX12-NEXT: buffer_load_u8 v[4:5], off, s[0:3], null tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b8 v[0:1], v4, off
; GFX12-NEXT: global_store_b32 v[2:3], v5, off
; GFX12-NEXT: s_endpgm
@@ -113,6 +115,7 @@ define amdgpu_ps void @raw_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX11-NEXT: v_mov_b32_e32 v5, v4
; GFX11-NEXT: buffer_load_u16 v[4:5], off, s[0:3], 0 tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b16 v[0:1], v4, off
; GFX11-NEXT: global_store_b32 v[2:3], v5, off
; GFX11-NEXT: s_endpgm
@@ -124,6 +127,7 @@ define amdgpu_ps void @raw_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX12-NEXT: v_mov_b32_e32 v5, v4
; GFX12-NEXT: buffer_load_u16 v[4:5], off, s[0:3], null tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b16 v[0:1], v4, off
; GFX12-NEXT: global_store_b32 v[2:3], v5, off
; GFX12-NEXT: s_endpgm
@@ -177,6 +181,7 @@ define amdgpu_ps void @raw_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX11-NEXT: v_mov_b32_e32 v5, v4
; GFX11-NEXT: buffer_load_u16 v[4:5], off, s[0:3], 0 tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b16 v[0:1], v4, off
; GFX11-NEXT: global_store_b32 v[2:3], v5, off
; GFX11-NEXT: s_endpgm
@@ -188,6 +193,7 @@ define amdgpu_ps void @raw_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX12-NEXT: v_mov_b32_e32 v5, v4
; GFX12-NEXT: buffer_load_u16 v[4:5], off, s[0:3], null tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b16 v[0:1], v4, off
; GFX12-NEXT: global_store_b32 v[2:3], v5, off
; GFX12-NEXT: s_endpgm
@@ -241,6 +247,7 @@ define amdgpu_ps void @raw_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX11-NEXT: v_mov_b32_e32 v5, v4
; GFX11-NEXT: buffer_load_b32 v[4:5], off, s[0:3], 0 tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b32 v[0:1], v4, off
; GFX11-NEXT: global_store_b32 v[2:3], v5, off
; GFX11-NEXT: s_endpgm
@@ -252,6 +259,7 @@ define amdgpu_ps void @raw_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX12-NEXT: v_mov_b32_e32 v5, v4
; GFX12-NEXT: buffer_load_b32 v[4:5], off, s[0:3], null tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b32 v[0:1], v4, off
; GFX12-NEXT: global_store_b32 v[2:3], v5, off
; GFX12-NEXT: s_endpgm
@@ -325,6 +333,7 @@ define amdgpu_ps void @raw_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: buffer_load_b64 v[4:6], off, s[0:3], 0 tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off
; GFX11-NEXT: global_store_b32 v[2:3], v6, off
; GFX11-NEXT: s_endpgm
@@ -336,6 +345,7 @@ define amdgpu_ps void @raw_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: buffer_load_b64 v[4:6], off, s[0:3], null tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off
; GFX12-NEXT: global_store_b32 v[2:3], v6, off
; GFX12-NEXT: s_endpgm
@@ -409,6 +419,7 @@ define amdgpu_ps void @raw_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: buffer_load_b64 v[4:6], off, s[0:3], 0 tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off
; GFX11-NEXT: global_store_b32 v[2:3], v6, off
; GFX11-NEXT: s_endpgm
@@ -420,6 +431,7 @@ define amdgpu_ps void @raw_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: buffer_load_b64 v[4:6], off, s[0:3], null tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off
; GFX12-NEXT: global_store_b32 v[2:3], v6, off
; GFX12-NEXT: s_endpgm
@@ -498,6 +510,7 @@ define amdgpu_ps void @raw_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: v_mov_b32_e32 v7, v4
; GFX11-NEXT: buffer_load_b96 v[4:7], off, s[0:3], 0 tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b96 v[0:1], v[4:6], off
; GFX11-NEXT: global_store_b32 v[2:3], v7, off
; GFX11-NEXT: s_endpgm
@@ -510,6 +523,7 @@ define amdgpu_ps void @raw_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX12-NEXT: v_mov_b32_e32 v7, v4
; GFX12-NEXT: buffer_load_b96 v[4:7], off, s[0:3], null tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b96 v[0:1], v[4:6], off
; GFX12-NEXT: global_store_b32 v[2:3], v7, off
; GFX12-NEXT: s_endpgm
@@ -588,6 +602,7 @@ define amdgpu_ps void @raw_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: v_mov_b32_e32 v7, v4
; GFX11-NEXT: buffer_load_b96 v[4:7], off, s[0:3], 0 tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b96 v[0:1], v[4:6], off
; GFX11-NEXT: global_store_b32 v[2:3], v7, off
; GFX11-NEXT: s_endpgm
@@ -600,6 +615,7 @@ define amdgpu_ps void @raw_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX12-NEXT: v_mov_b32_e32 v7, v4
; GFX12-NEXT: buffer_load_b96 v[4:7], off, s[0:3], null tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b96 v[0:1], v[4:6], off
; GFX12-NEXT: global_store_b32 v[2:3], v7, off
; GFX12-NEXT: s_endpgm
@@ -665,6 +681,7 @@ define amdgpu_ps void @raw_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: v_mov_b32_e32 v8, v4
; GFX11-NEXT: buffer_load_b128 v[4:8], off, s[0:3], 0 tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off
; GFX11-NEXT: global_store_b32 v[2:3], v8, off
; GFX11-NEXT: s_endpgm
@@ -677,6 +694,7 @@ define amdgpu_ps void @raw_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
; GFX12-NEXT: buffer_load_b128 v[4:8], off, s[0:3], null tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off
; GFX12-NEXT: global_store_b32 v[2:3], v8, off
; GFX12-NEXT: s_endpgm
@@ -742,6 +760,7 @@ define amdgpu_ps void @raw_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: v_mov_b32_e32 v8, v4
; GFX11-NEXT: buffer_load_b128 v[4:8], off, s[0:3], 0 tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off
; GFX11-NEXT: global_store_b32 v[2:3], v8, off
; GFX11-NEXT: s_endpgm
@@ -754,6 +773,7 @@ define amdgpu_ps void @raw_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
; GFX12-NEXT: buffer_load_b128 v[4:8], off, s[0:3], null tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off
; GFX12-NEXT: global_store_b32 v[2:3], v8, off
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
index 09abebd638611..d0a38ae1add7d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
@@ -51,11 +51,11 @@ define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) {
; CHECK-LABEL: test_softwqm1:
; CHECK: ; %bb.0: ; %main_body
; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: v_mov_b32_e32 v2, s1
-; CHECK-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen
-; CHECK-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_add_f32_e32 v1, v1, v2
+; CHECK-NEXT: v_add_f32_e32 v1, v2, v1
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
; CHECK-NEXT: v_add_f32_e32 v0, v1, v1
; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
@@ -79,11 +79,11 @@ define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) {
; CHECK-NEXT: s_mov_b64 s[2:3], exec
; CHECK-NEXT: s_wqm_b64 exec, exec
; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: v_mov_b32_e32 v2, s1
-; CHECK-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen
-; CHECK-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_add_f32_e32 v1, v1, v2
+; CHECK-NEXT: v_add_f32_e32 v1, v2, v1
; CHECK-NEXT: v_mov_b32_e32 v2, v1
; CHECK-NEXT: v_add_f32_e32 v1, v1, v1
; CHECK-NEXT: s_and_b64 exec, exec, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
index 60c04749c9b74..f441b0852ff61 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
@@ -49,6 +49,7 @@ define amdgpu_ps void @struct_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: v_mov_b32_e32 v5, v4
; GFX11-NEXT: buffer_load_u8 v[4:5], v4, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b8 v[0:1], v4, off
; GFX11-NEXT: global_store_b32 v[2:3], v5, off
; GFX11-NEXT: s_endpgm
@@ -60,6 +61,7 @@ define amdgpu_ps void @struct_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX12-NEXT: v_mov_b32_e32 v5, v4
; GFX12-NEXT: buffer_load_u8 v[4:5], v4, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b8 v[0:1], v4, off
; GFX12-NEXT: global_store_b32 v[2:3], v5, off
; GFX12-NEXT: s_endpgm
@@ -113,6 +115,7 @@ define amdgpu_ps void @struct_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr add
; GFX11-NEXT: v_mov_b32_e32 v5, v4
; GFX11-NEXT: buffer_load_u16 v[4:5], v4, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b16 v[0:1], v4, off
; GFX11-NEXT: global_store_b32 v[2:3], v5, off
; GFX11-NEXT: s_endpgm
@@ -124,6 +127,7 @@ define amdgpu_ps void @struct_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr add
; GFX12-NEXT: v_mov_b32_e32 v5, v4
; GFX12-NEXT: buffer_load_u16 v[4:5], v4, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b16 v[0:1], v4, off
; GFX12-NEXT: global_store_b32 v[2:3], v5, off
; GFX12-NEXT: s_endpgm
@@ -177,6 +181,7 @@ define amdgpu_ps void @struct_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr add
; GFX11-NEXT: v_mov_b32_e32 v5, v4
; GFX11-NEXT: buffer_load_u16 v[4:5], v4, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b16 v[0:1], v4, off
; GFX11-NEXT: global_store_b32 v[2:3], v5, off
; GFX11-NEXT: s_endpgm
@@ -188,6 +193,7 @@ define amdgpu_ps void @struct_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr add
; GFX12-NEXT: v_mov_b32_e32 v5, v4
; GFX12-NEXT: buffer_load_u16 v[4:5], v4, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b16 v[0:1], v4, off
; GFX12-NEXT: global_store_b32 v[2:3], v5, off
; GFX12-NEXT: s_endpgm
@@ -241,6 +247,7 @@ define amdgpu_ps void @struct_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr add
; GFX11-NEXT: v_mov_b32_e32 v5, v4
; GFX11-NEXT: buffer_load_b32 v[4:5], v4, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b32 v[0:1], v4, off
; GFX11-NEXT: global_store_b32 v[2:3], v5, off
; GFX11-NEXT: s_endpgm
@@ -252,6 +259,7 @@ define amdgpu_ps void @struct_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr add
; GFX12-NEXT: v_mov_b32_e32 v5, v4
; GFX12-NEXT: buffer_load_b32 v[4:5], v4, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b32 v[0:1], v4, off
; GFX12-NEXT: global_store_b32 v[2:3], v5, off
; GFX12-NEXT: s_endpgm
@@ -325,6 +333,7 @@ define amdgpu_ps void @struct_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr a
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: buffer_load_b64 v[4:6], v4, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off
; GFX11-NEXT: global_store_b32 v[2:3], v6, off
; GFX11-NEXT: s_endpgm
@@ -336,6 +345,7 @@ define amdgpu_ps void @struct_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr a
; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: buffer_load_b64 v[4:6], v4, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off
; GFX12-NEXT: global_store_b32 v[2:3], v6, off
; GFX12-NEXT: s_endpgm
@@ -409,6 +419,7 @@ define amdgpu_ps void @struct_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr a
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: buffer_load_b64 v[4:6], v4, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off
; GFX11-NEXT: global_store_b32 v[2:3], v6, off
; GFX11-NEXT: s_endpgm
@@ -420,6 +431,7 @@ define amdgpu_ps void @struct_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr a
; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: buffer_load_b64 v[4:6], v4, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off
; GFX12-NEXT: global_store_b32 v[2:3], v6, off
; GFX12-NEXT: s_endpgm
@@ -498,6 +510,7 @@ define amdgpu_ps void @struct_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr a
; GFX11-NEXT: v_mov_b32_e32 v7, v4
; GFX11-NEXT: buffer_load_b96 v[4:7], v4, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b96 v[0:1], v[4:6], off
; GFX11-NEXT: global_store_b32 v[2:3], v7, off
; GFX11-NEXT: s_endpgm
@@ -510,6 +523,7 @@ define amdgpu_ps void @struct_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr a
; GFX12-NEXT: v_mov_b32_e32 v7, v4
; GFX12-NEXT: buffer_load_b96 v[4:7], v4, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b96 v[0:1], v[4:6], off
; GFX12-NEXT: global_store_b32 v[2:3], v7, off
; GFX12-NEXT: s_endpgm
@@ -588,6 +602,7 @@ define amdgpu_ps void @struct_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr a
; GFX11-NEXT: v_mov_b32_e32 v7, v4
; GFX11-NEXT: buffer_load_b96 v[4:7], v4, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b96 v[0:1], v[4:6], off
; GFX11-NEXT: global_store_b32 v[2:3], v7, off
; GFX11-NEXT: s_endpgm
@@ -600,6 +615,7 @@ define amdgpu_ps void @struct_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr a
; GFX12-NEXT: v_mov_b32_e32 v7, v4
; GFX12-NEXT: buffer_load_b96 v[4:7], v4, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b96 v[0:1], v[4:6], off
; GFX12-NEXT: global_store_b32 v[2:3], v7, off
; GFX12-NEXT: s_endpgm
@@ -665,6 +681,7 @@ define amdgpu_ps void @struct_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr a
; GFX11-NEXT: v_mov_b32_e32 v8, v4
; GFX11-NEXT: buffer_load_b128 v[4:8], v4, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off
; GFX11-NEXT: global_store_b32 v[2:3], v8, off
; GFX11-NEXT: s_endpgm
@@ -677,6 +694,7 @@ define amdgpu_ps void @struct_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr a
; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
; GFX12-NEXT: buffer_load_b128 v[4:8], v4, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off
; GFX12-NEXT: global_store_b32 v[2:3], v8, off
; GFX12-NEXT: s_endpgm
@@ -742,6 +760,7 @@ define amdgpu_ps void @struct_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr a
; GFX11-NEXT: v_mov_b32_e32 v8, v4
; GFX11-NEXT: buffer_load_b128 v[4:8], v4, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off
; GFX11-NEXT: global_store_b32 v[2:3], v8, off
; GFX11-NEXT: s_endpgm
@@ -754,6 +773,7 @@ define amdgpu_ps void @struct_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr a
; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
; GFX12-NEXT: buffer_load_b128 v[4:8], v4, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off
; GFX12-NEXT: global_store_b32 v[2:3], v8, off
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
index 947c838740d43..ab85230b9f861 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
@@ -10,6 +10,7 @@ define amdgpu_ps <3 x float> @gather_sample(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-LABEL: gather_sample:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample_lz v2, [v4, v4], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -48,6 +49,7 @@ define amdgpu_ps <3 x float> @sample_gather(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-LABEL: sample_gather:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample_lz v2, [v4, v4], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
index 3874a456590dc..46d662277928c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
@@ -77,10 +77,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[32:39]
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16
; W32-NEXT: global_store_b128 v[40:41], v[44:47], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16
; W32-NEXT: global_store_b128 v[42:43], v[32:35], off
; W32-NEXT: s_endpgm
@@ -102,10 +101,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
; W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[44:51]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16
; W32-NEXT: global_store_b128 v[40:41], v[44:47], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16
; W32-NEXT: global_store_b128 v[42:43], v[32:35], off
; W32-NEXT: s_endpgm
@@ -152,10 +150,9 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[32:39]
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16
; W32-NEXT: global_store_b128 v[40:41], v[44:47], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16
; W32-NEXT: global_store_b128 v[42:43], v[32:35], off
; W32-NEXT: s_endpgm
@@ -177,10 +174,9 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
; W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[44:51]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16
; W32-NEXT: global_store_b128 v[40:41], v[44:47], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16
; W32-NEXT: global_store_b128 v[42:43], v[32:35], off
; W32-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
index 25adc25d71768..377be223dc442 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
@@ -69,6 +69,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[32:35]
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[36:37], v[40:43], off
; W64-NEXT: global_store_b128 v[38:39], v[32:35], off
; W64-NEXT: s_endpgm
@@ -90,6 +91,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
; W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[40:43]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[36:37], v[40:43], off
; W64-NEXT: global_store_b128 v[38:39], v[32:35], off
; W64-NEXT: s_endpgm
@@ -132,6 +134,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[32:35]
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[36:37], v[40:43], off
; W64-NEXT: global_store_b128 v[38:39], v[32:35], off
; W64-NEXT: s_endpgm
@@ -153,6 +156,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
; W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[40:43]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[36:37], v[40:43], off
; W64-NEXT: global_store_b128 v[38:39], v[32:35], off
; W64-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll
index 544941b7fa0da..753146d67cea5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll
@@ -409,32 +409,32 @@ define amdgpu_kernel void @fma_v2f16(
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
+; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; VI-NEXT: v_fma_f16 v3, v5, v4, v3
; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_fma_f16 v0, v2, v1, v0
+; VI-NEXT: v_fma_f16 v0, v2, v0, v1
; VI-NEXT: v_or_b32_e32 v0, v0, v3
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
@@ -485,15 +485,15 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_mov_b32 s15, s7
-; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s2, 0x40400000
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
@@ -505,9 +505,9 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_fma_f32 v2, v3, s2, v2
+; SI-NEXT: v_fma_f32 v2, v2, s2, v3
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_fma_f32 v0, v1, s2, v0
+; SI-NEXT: v_fma_f32 v0, v0, s2, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -582,15 +582,15 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_mov_b32 s15, s7
-; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s2, 0x40400000
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
@@ -602,9 +602,9 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_fma_f32 v2, v3, s2, v2
+; SI-NEXT: v_fma_f32 v2, v2, s2, v3
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_fma_f32 v0, v1, s2, v0
+; SI-NEXT: v_fma_f32 v0, v0, s2, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -679,15 +679,15 @@ define amdgpu_kernel void @fma_v2f16_imm_c(
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_mov_b32 s15, s7
-; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s2, 0x40400000
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
@@ -699,9 +699,9 @@ define amdgpu_kernel void @fma_v2f16_imm_c(
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_fma_f32 v2, v3, v2, s2
+; SI-NEXT: v_fma_f32 v2, v2, v3, s2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_fma_f32 v0, v1, v0, s2
+; SI-NEXT: v_fma_f32 v0, v0, v1, s2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -835,34 +835,34 @@ define amdgpu_kernel void @fma_v4f16(
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s14, s2
-; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
; VI-NEXT: s_mov_b32 s16, s8
; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_mov_b32 s8, s10
; VI-NEXT: s_mov_b32 s9, s11
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
-; VI-NEXT: s_mov_b32 s12, s6
-; VI-NEXT: s_mov_b32 s13, s7
-; VI-NEXT: s_mov_b32 s18, s2
-; VI-NEXT: s_mov_b32 s19, s3
-; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[12:15], 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5
-; VI-NEXT: v_fma_f16 v1, v5, v3, v1
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; VI-NEXT: v_fma_f16 v1, v5, v1, v3
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v4
-; VI-NEXT: v_fma_f16 v0, v4, v2, v0
+; VI-NEXT: v_fma_f16 v0, v4, v0, v2
; VI-NEXT: v_fma_f16 v2, v8, v7, v6
; VI-NEXT: v_fma_f16 v3, v9, v5, v3
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index 61991c8b409dd..f2d708a4696b1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -118,6 +118,7 @@ define amdgpu_kernel void @fmuladd_f16(
; GFX10-FLUSH-NEXT: s_mov_b32 s5, s11
; GFX10-FLUSH-NEXT: s_mov_b32 s16, s12
; GFX10-FLUSH-NEXT: s_mov_b32 s17, s13
+; GFX10-FLUSH-NEXT: s_clause 0x1
; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[16:19], 0
; GFX10-FLUSH-NEXT: s_mov_b32 s12, s14
@@ -152,6 +153,7 @@ define amdgpu_kernel void @fmuladd_f16(
; GFX10-DENORM-NEXT: s_mov_b32 s17, s13
; GFX10-DENORM-NEXT: s_mov_b32 s20, s14
; GFX10-DENORM-NEXT: s_mov_b32 s21, s15
+; GFX10-DENORM-NEXT: s_clause 0x2
; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[16:19], 0
; GFX10-DENORM-NEXT: buffer_load_ushort v2, off, s[20:23], 0
@@ -176,6 +178,7 @@ define amdgpu_kernel void @fmuladd_f16(
; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s13, s3
; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s16, s4
; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s17, s5
+; GFX11-FLUSH-TRUE16-NEXT: s_clause 0x1
; GFX11-FLUSH-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0
; GFX11-FLUSH-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0
; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s12, s6
@@ -206,6 +209,7 @@ define amdgpu_kernel void @fmuladd_f16(
; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s13, s3
; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s16, s4
; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s17, s5
+; GFX11-FLUSH-FAKE16-NEXT: s_clause 0x1
; GFX11-FLUSH-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0
; GFX11-FLUSH-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0
; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s4, s6
@@ -241,6 +245,7 @@ define amdgpu_kernel void @fmuladd_f16(
; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s17, s5
; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s20, s6
; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s21, s7
+; GFX11-DENORM-TRUE16-NEXT: s_clause 0x2
; GFX11-DENORM-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0
; GFX11-DENORM-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0
; GFX11-DENORM-TRUE16-NEXT: buffer_load_u16 v2, off, s[20:23], 0
@@ -269,6 +274,7 @@ define amdgpu_kernel void @fmuladd_f16(
; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s17, s5
; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s20, s6
; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s21, s7
+; GFX11-DENORM-FAKE16-NEXT: s_clause 0x2
; GFX11-DENORM-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0
; GFX11-DENORM-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0
; GFX11-DENORM-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], 0
@@ -807,28 +813,28 @@ define amdgpu_kernel void @fmuladd_v2f16(
; VI-FLUSH-NEXT: s_mov_b32 s14, s10
; VI-FLUSH-NEXT: s_mov_b32 s15, s11
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: s_mov_b32 s12, s2
-; VI-FLUSH-NEXT: s_mov_b32 s13, s3
; VI-FLUSH-NEXT: s_mov_b32 s16, s4
; VI-FLUSH-NEXT: s_mov_b32 s17, s5
-; VI-FLUSH-NEXT: s_mov_b32 s18, s10
-; VI-FLUSH-NEXT: s_mov_b32 s19, s11
; VI-FLUSH-NEXT: s_mov_b32 s4, s6
; VI-FLUSH-NEXT: s_mov_b32 s5, s7
; VI-FLUSH-NEXT: s_mov_b32 s6, s10
; VI-FLUSH-NEXT: s_mov_b32 s7, s11
-; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; VI-FLUSH-NEXT: s_mov_b32 s12, s2
+; VI-FLUSH-NEXT: s_mov_b32 s13, s3
+; VI-FLUSH-NEXT: s_mov_b32 s18, s10
+; VI-FLUSH-NEXT: s_mov_b32 s19, s11
+; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[12:15], 0
; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[16:19], 0
; VI-FLUSH-NEXT: s_mov_b32 s8, s0
; VI-FLUSH-NEXT: s_mov_b32 s9, s1
-; VI-FLUSH-NEXT: s_waitcnt vmcnt(1)
-; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; VI-FLUSH-NEXT: s_waitcnt vmcnt(2)
+; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; VI-FLUSH-NEXT: v_mac_f16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-FLUSH-NEXT: v_mac_f16_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; VI-FLUSH-NEXT: v_mac_f16_e32 v1, v0, v2
-; VI-FLUSH-NEXT: v_or_b32_e32 v0, v1, v3
+; VI-FLUSH-NEXT: v_mac_f16_e32 v0, v1, v2
+; VI-FLUSH-NEXT: v_or_b32_e32 v0, v0, v3
; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-FLUSH-NEXT: s_endpgm
;
@@ -838,32 +844,32 @@ define amdgpu_kernel void @fmuladd_v2f16(
; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000
; VI-DENORM-NEXT: s_mov_b32 s10, -1
; VI-DENORM-NEXT: s_mov_b32 s14, s10
-; VI-DENORM-NEXT: s_mov_b32 s15, s11
+; VI-DENORM-NEXT: s_mov_b32 s18, s10
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; VI-DENORM-NEXT: s_mov_b32 s12, s2
+; VI-DENORM-NEXT: s_mov_b32 s13, s3
; VI-DENORM-NEXT: s_mov_b32 s16, s4
; VI-DENORM-NEXT: s_mov_b32 s17, s5
+; VI-DENORM-NEXT: s_mov_b32 s19, s11
+; VI-DENORM-NEXT: s_mov_b32 s15, s11
; VI-DENORM-NEXT: s_mov_b32 s4, s6
; VI-DENORM-NEXT: s_mov_b32 s5, s7
; VI-DENORM-NEXT: s_mov_b32 s6, s10
; VI-DENORM-NEXT: s_mov_b32 s7, s11
-; VI-DENORM-NEXT: s_mov_b32 s12, s2
-; VI-DENORM-NEXT: s_mov_b32 s13, s3
-; VI-DENORM-NEXT: s_mov_b32 s18, s10
-; VI-DENORM-NEXT: s_mov_b32 s19, s11
-; VI-DENORM-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; VI-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; VI-DENORM-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; VI-DENORM-NEXT: buffer_load_dword v1, off, s[4:7], 0
; VI-DENORM-NEXT: buffer_load_dword v2, off, s[12:15], 0
; VI-DENORM-NEXT: s_mov_b32 s8, s0
; VI-DENORM-NEXT: s_mov_b32 s9, s1
; VI-DENORM-NEXT: s_waitcnt vmcnt(2)
-; VI-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; VI-DENORM-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; VI-DENORM-NEXT: s_waitcnt vmcnt(1)
-; VI-DENORM-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; VI-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
; VI-DENORM-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; VI-DENORM-NEXT: v_fma_f16 v3, v5, v4, v3
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; VI-DENORM-NEXT: v_fma_f16 v0, v2, v1, v0
+; VI-DENORM-NEXT: v_fma_f16 v0, v2, v0, v1
; VI-DENORM-NEXT: v_or_b32_e32 v0, v0, v3
; VI-DENORM-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-DENORM-NEXT: s_endpgm
@@ -882,6 +888,7 @@ define amdgpu_kernel void @fmuladd_v2f16(
; GFX10-FLUSH-NEXT: s_mov_b32 s5, s11
; GFX10-FLUSH-NEXT: s_mov_b32 s16, s12
; GFX10-FLUSH-NEXT: s_mov_b32 s17, s13
+; GFX10-FLUSH-NEXT: s_clause 0x1
; GFX10-FLUSH-NEXT: buffer_load_dword v0, off, s[4:7], 0
; GFX10-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0
; GFX10-FLUSH-NEXT: s_mov_b32 s12, s14
@@ -916,6 +923,7 @@ define amdgpu_kernel void @fmuladd_v2f16(
; GFX10-DENORM-NEXT: s_mov_b32 s17, s13
; GFX10-DENORM-NEXT: s_mov_b32 s20, s14
; GFX10-DENORM-NEXT: s_mov_b32 s21, s15
+; GFX10-DENORM-NEXT: s_clause 0x2
; GFX10-DENORM-NEXT: buffer_load_dword v0, off, s[4:7], 0
; GFX10-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0
; GFX10-DENORM-NEXT: buffer_load_dword v2, off, s[20:23], 0
@@ -940,6 +948,7 @@ define amdgpu_kernel void @fmuladd_v2f16(
; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3
; GFX11-FLUSH-NEXT: s_mov_b32 s16, s4
; GFX11-FLUSH-NEXT: s_mov_b32 s17, s5
+; GFX11-FLUSH-NEXT: s_clause 0x1
; GFX11-FLUSH-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-FLUSH-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-FLUSH-NEXT: s_mov_b32 s4, s6
@@ -975,6 +984,7 @@ define amdgpu_kernel void @fmuladd_v2f16(
; GFX11-DENORM-NEXT: s_mov_b32 s17, s5
; GFX11-DENORM-NEXT: s_mov_b32 s20, s6
; GFX11-DENORM-NEXT: s_mov_b32 s21, s7
+; GFX11-DENORM-NEXT: s_clause 0x2
; GFX11-DENORM-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-DENORM-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-DENORM-NEXT: buffer_load_b32 v2, off, s[20:23], 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
index 3344c73f9eb6f..f3d225204b0ea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
@@ -2399,7 +2399,6 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:84
; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:96
; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:92
-; GFX950-NEXT: scratch_load_dword v31, off, s32
; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:104
; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:100
; GFX950-NEXT: v_accvgpr_write_b32 a11, v58 ; Reload Reuse
@@ -2408,21 +2407,22 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX950-NEXT: v_accvgpr_write_b32 a14, v61 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a15, v62 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a16, v63 ; Reload Reuse
-; GFX950-NEXT: s_waitcnt vmcnt(25)
+; GFX950-NEXT: s_waitcnt vmcnt(24)
; GFX950-NEXT: v_max_f64 v[58:59], v[2:3], v[36:37]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[36:37]
; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:112
; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:108
-; GFX950-NEXT: s_waitcnt vmcnt(25)
+; GFX950-NEXT: s_waitcnt vmcnt(24)
; GFX950-NEXT: v_max_f64 v[60:61], v[4:5], v[38:39]
; GFX950-NEXT: v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39]
; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:120
; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:116
-; GFX950-NEXT: s_waitcnt vmcnt(25)
+; GFX950-NEXT: s_waitcnt vmcnt(24)
; GFX950-NEXT: v_max_f64 v[62:63], v[6:7], v[48:49]
; GFX950-NEXT: v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49]
; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:128
; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:124
+; GFX950-NEXT: scratch_load_dword v31, off, s32
; GFX950-NEXT: s_waitcnt vmcnt(25)
; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[56:57]
; GFX950-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57]
@@ -2477,7 +2477,7 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX950-NEXT: v_cndmask_b32_e64 v20, v52, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v21, v53, v0, vcc
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[22:23], v[34:35]
-; GFX950-NEXT: s_waitcnt vmcnt(6)
+; GFX950-NEXT: s_waitcnt vmcnt(7)
; GFX950-NEXT: v_max_f64 v[34:35], v[24:25], v[32:33]
; GFX950-NEXT: v_accvgpr_read_b32 v60, a13 ; Reload Reuse
; GFX950-NEXT: v_cndmask_b32_e64 v22, v50, 0, vcc
@@ -2497,13 +2497,13 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX950-NEXT: v_accvgpr_read_b32 v42, a3 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v41, a2 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse
-; GFX950-NEXT: s_waitcnt vmcnt(4)
+; GFX950-NEXT: s_waitcnt vmcnt(5)
; GFX950-NEXT: v_max_f64 v[32:33], v[26:27], v[36:37]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[36:37]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e64 v26, v32, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v27, v33, v0, vcc
-; GFX950-NEXT: s_waitcnt vmcnt(2)
+; GFX950-NEXT: s_waitcnt vmcnt(3)
; GFX950-NEXT: v_max_f64 v[32:33], v[28:29], v[38:39]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[38:39]
; GFX950-NEXT: s_nop 1
@@ -2642,7 +2642,6 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16
@@ -2675,49 +2674,50 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:124
-; GFX11-NEXT: s_waitcnt vmcnt(30)
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_waitcnt vmcnt(31)
; GFX11-NEXT: v_max_f64 v[96:97], v[0:1], v[32:33]
; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[32:33]
-; GFX11-NEXT: s_waitcnt vmcnt(28)
+; GFX11-NEXT: s_waitcnt vmcnt(29)
; GFX11-NEXT: v_max_f64 v[32:33], v[2:3], v[34:35]
; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[34:35]
-; GFX11-NEXT: s_waitcnt vmcnt(26)
+; GFX11-NEXT: s_waitcnt vmcnt(27)
; GFX11-NEXT: v_max_f64 v[34:35], v[4:5], v[36:37]
; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[36:37]
-; GFX11-NEXT: s_waitcnt vmcnt(24)
+; GFX11-NEXT: s_waitcnt vmcnt(25)
; GFX11-NEXT: v_max_f64 v[36:37], v[6:7], v[38:39]
; GFX11-NEXT: v_cmp_u_f64_e64 s2, v[6:7], v[38:39]
-; GFX11-NEXT: s_waitcnt vmcnt(22)
+; GFX11-NEXT: s_waitcnt vmcnt(23)
; GFX11-NEXT: v_max_f64 v[38:39], v[8:9], v[48:49]
; GFX11-NEXT: v_cmp_u_f64_e64 s3, v[8:9], v[48:49]
-; GFX11-NEXT: s_waitcnt vmcnt(20)
+; GFX11-NEXT: s_waitcnt vmcnt(21)
; GFX11-NEXT: v_max_f64 v[48:49], v[10:11], v[50:51]
; GFX11-NEXT: v_cmp_u_f64_e64 s4, v[10:11], v[50:51]
-; GFX11-NEXT: s_waitcnt vmcnt(18)
+; GFX11-NEXT: s_waitcnt vmcnt(19)
; GFX11-NEXT: v_max_f64 v[50:51], v[12:13], v[52:53]
; GFX11-NEXT: v_cmp_u_f64_e64 s5, v[12:13], v[52:53]
-; GFX11-NEXT: s_waitcnt vmcnt(16)
+; GFX11-NEXT: s_waitcnt vmcnt(17)
; GFX11-NEXT: v_max_f64 v[52:53], v[14:15], v[54:55]
; GFX11-NEXT: v_cmp_u_f64_e64 s6, v[14:15], v[54:55]
-; GFX11-NEXT: s_waitcnt vmcnt(14)
+; GFX11-NEXT: s_waitcnt vmcnt(15)
; GFX11-NEXT: v_max_f64 v[54:55], v[16:17], v[64:65]
; GFX11-NEXT: v_cmp_u_f64_e64 s7, v[16:17], v[64:65]
-; GFX11-NEXT: s_waitcnt vmcnt(12)
+; GFX11-NEXT: s_waitcnt vmcnt(13)
; GFX11-NEXT: v_max_f64 v[64:65], v[18:19], v[66:67]
; GFX11-NEXT: v_cmp_u_f64_e64 s8, v[18:19], v[66:67]
-; GFX11-NEXT: s_waitcnt vmcnt(10)
+; GFX11-NEXT: s_waitcnt vmcnt(11)
; GFX11-NEXT: v_max_f64 v[66:67], v[20:21], v[68:69]
; GFX11-NEXT: v_cmp_u_f64_e64 s9, v[20:21], v[68:69]
-; GFX11-NEXT: s_waitcnt vmcnt(8)
+; GFX11-NEXT: s_waitcnt vmcnt(9)
; GFX11-NEXT: v_max_f64 v[68:69], v[22:23], v[70:71]
; GFX11-NEXT: v_cmp_u_f64_e64 s10, v[22:23], v[70:71]
-; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: s_waitcnt vmcnt(7)
; GFX11-NEXT: v_max_f64 v[70:71], v[24:25], v[80:81]
; GFX11-NEXT: v_cmp_u_f64_e64 s11, v[24:25], v[80:81]
-; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: v_max_f64 v[80:81], v[26:27], v[82:83]
; GFX11-NEXT: v_cmp_u_f64_e64 s12, v[26:27], v[82:83]
-; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: v_max_f64 v[82:83], v[28:29], v[84:85]
; GFX11-NEXT: v_cmp_u_f64_e64 s13, v[28:29], v[84:85]
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2765,7 +2765,6 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1f
-; GFX12-NEXT: scratch_load_b32 v31, off, s32
; GFX12-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX12-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX12-NEXT: scratch_load_b32 v35, off, s32 offset:16
@@ -2798,35 +2797,36 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX12-NEXT: scratch_load_b32 v84, off, s32 offset:116
; GFX12-NEXT: scratch_load_b32 v87, off, s32 offset:128
; GFX12-NEXT: scratch_load_b32 v86, off, s32 offset:124
-; GFX12-NEXT: s_wait_loadcnt 0x1e
+; GFX12-NEXT: scratch_load_b32 v31, off, s32
+; GFX12-NEXT: s_wait_loadcnt 0x1f
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[32:33]
-; GFX12-NEXT: s_wait_loadcnt 0x1c
+; GFX12-NEXT: s_wait_loadcnt 0x1d
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[34:35]
-; GFX12-NEXT: s_wait_loadcnt 0x1a
+; GFX12-NEXT: s_wait_loadcnt 0x1b
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[36:37]
-; GFX12-NEXT: s_wait_loadcnt 0x18
+; GFX12-NEXT: s_wait_loadcnt 0x19
; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[38:39]
-; GFX12-NEXT: s_wait_loadcnt 0x16
+; GFX12-NEXT: s_wait_loadcnt 0x17
; GFX12-NEXT: v_maximum_f64 v[8:9], v[8:9], v[48:49]
-; GFX12-NEXT: s_wait_loadcnt 0x14
+; GFX12-NEXT: s_wait_loadcnt 0x15
; GFX12-NEXT: v_maximum_f64 v[10:11], v[10:11], v[50:51]
-; GFX12-NEXT: s_wait_loadcnt 0x12
+; GFX12-NEXT: s_wait_loadcnt 0x13
; GFX12-NEXT: v_maximum_f64 v[12:13], v[12:13], v[52:53]
-; GFX12-NEXT: s_wait_loadcnt 0x10
+; GFX12-NEXT: s_wait_loadcnt 0x11
; GFX12-NEXT: v_maximum_f64 v[14:15], v[14:15], v[54:55]
-; GFX12-NEXT: s_wait_loadcnt 0xe
+; GFX12-NEXT: s_wait_loadcnt 0xf
; GFX12-NEXT: v_maximum_f64 v[16:17], v[16:17], v[64:65]
-; GFX12-NEXT: s_wait_loadcnt 0xc
+; GFX12-NEXT: s_wait_loadcnt 0xd
; GFX12-NEXT: v_maximum_f64 v[18:19], v[18:19], v[66:67]
-; GFX12-NEXT: s_wait_loadcnt 0xa
+; GFX12-NEXT: s_wait_loadcnt 0xb
; GFX12-NEXT: v_maximum_f64 v[20:21], v[20:21], v[68:69]
-; GFX12-NEXT: s_wait_loadcnt 0x8
+; GFX12-NEXT: s_wait_loadcnt 0x9
; GFX12-NEXT: v_maximum_f64 v[22:23], v[22:23], v[70:71]
-; GFX12-NEXT: s_wait_loadcnt 0x6
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: v_maximum_f64 v[24:25], v[24:25], v[80:81]
-; GFX12-NEXT: s_wait_loadcnt 0x4
+; GFX12-NEXT: s_wait_loadcnt 0x5
; GFX12-NEXT: v_maximum_f64 v[26:27], v[26:27], v[82:83]
-; GFX12-NEXT: s_wait_loadcnt 0x2
+; GFX12-NEXT: s_wait_loadcnt 0x3
; GFX12-NEXT: v_maximum_f64 v[28:29], v[28:29], v[84:85]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[30:31], v[30:31], v[86:87]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 863240cc591c3..524c4a557cc04 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -458,44 +458,40 @@ define amdgpu_kernel void @maxnum_v2f16(
;
; VI-LABEL: maxnum_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s8, s[8:9], 0x0
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dword s6, s[2:3], 0x0
+; VI-NEXT: s_load_dword s4, s[4:5], 0x0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: v_max_f16_e64 v0, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s4, 16
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v2f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0
-; GFX9-NEXT: s_load_dword s11, s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s5, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s10, s10
-; GFX9-NEXT: v_pk_max_f16 v1, s11, s11
+; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT: v_pk_max_f16 v0, s5, s5
; GFX9-NEXT: v_pk_max_f16 v0, v1, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: maxnum_v2f16:
@@ -504,6 +500,7 @@ define amdgpu_kernel void @maxnum_v2f16(
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-NEXT: s_load_dword s5, s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
@@ -521,6 +518,7 @@ define amdgpu_kernel void @maxnum_v2f16(
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[6:7], 0x0
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
@@ -753,52 +751,48 @@ define amdgpu_kernel void @maxnum_v3f16(
;
; VI-LABEL: maxnum_v3f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: v_max_f16_e64 v0, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s4, 16
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: v_max_f16_e64 v1, s9, s9
-; VI-NEXT: v_max_f16_e64 v2, s3, s3
+; VI-NEXT: v_max_f16_e64 v1, s5, s5
+; VI-NEXT: v_max_f16_e64 v2, s7, s7
; VI-NEXT: v_max_f16_e32 v1, v2, v1
-; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v3f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
-; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s10, s10
-; GFX9-NEXT: v_pk_max_f16 v1, s12, s12
-; GFX9-NEXT: v_pk_max_f16 v2, s11, s11
+; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT: v_pk_max_f16 v0, s8, s8
+; GFX9-NEXT: v_pk_max_f16 v2, s9, s9
; GFX9-NEXT: v_pk_max_f16 v0, v1, v0
-; GFX9-NEXT: v_pk_max_f16 v1, s13, s13
+; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
; GFX9-NEXT: v_pk_max_f16 v1, v1, v2
-; GFX9-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: maxnum_v3f16:
@@ -807,6 +801,7 @@ define amdgpu_kernel void @maxnum_v3f16(
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
@@ -828,6 +823,7 @@ define amdgpu_kernel void @maxnum_v3f16(
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -897,56 +893,52 @@ define amdgpu_kernel void @maxnum_v4f16(
;
; VI-LABEL: maxnum_v4f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s9, s9
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
-; VI-NEXT: s_lshr_b32 s0, s9, 16
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: v_max_f16_e64 v0, s5, s5
+; VI-NEXT: s_lshr_b32 s5, s5, 16
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s5, s5
+; VI-NEXT: s_lshr_b32 s5, s7, 16
+; VI-NEXT: v_max_f16_e64 v2, s5, s5
; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v1, v0, v1
-; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v0, s4, s4
+; VI-NEXT: v_max_f16_e64 v2, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s4, 16
; VI-NEXT: v_max_f16_e32 v0, v2, v0
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v3, s4, s4
; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v4f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
-; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s11, s11
-; GFX9-NEXT: v_pk_max_f16 v1, s13, s13
-; GFX9-NEXT: v_pk_max_f16 v2, s10, s10
+; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT: v_pk_max_f16 v0, s9, s9
+; GFX9-NEXT: v_pk_max_f16 v2, s8, s8
; GFX9-NEXT: v_pk_max_f16 v1, v1, v0
-; GFX9-NEXT: v_pk_max_f16 v0, s12, s12
+; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: maxnum_v4f16:
@@ -955,6 +947,7 @@ define amdgpu_kernel void @maxnum_v4f16(
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
@@ -975,6 +968,7 @@ define amdgpu_kernel void @maxnum_v4f16(
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
index 1d1673315f6ff..73059dbb3f752 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
@@ -2399,7 +2399,6 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:84
; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:96
; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:92
-; GFX950-NEXT: scratch_load_dword v31, off, s32
; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:104
; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:100
; GFX950-NEXT: v_accvgpr_write_b32 a11, v58 ; Reload Reuse
@@ -2408,21 +2407,22 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX950-NEXT: v_accvgpr_write_b32 a14, v61 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a15, v62 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a16, v63 ; Reload Reuse
-; GFX950-NEXT: s_waitcnt vmcnt(25)
+; GFX950-NEXT: s_waitcnt vmcnt(24)
; GFX950-NEXT: v_min_f64 v[58:59], v[2:3], v[36:37]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[36:37]
; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:112
; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:108
-; GFX950-NEXT: s_waitcnt vmcnt(25)
+; GFX950-NEXT: s_waitcnt vmcnt(24)
; GFX950-NEXT: v_min_f64 v[60:61], v[4:5], v[38:39]
; GFX950-NEXT: v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39]
; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:120
; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:116
-; GFX950-NEXT: s_waitcnt vmcnt(25)
+; GFX950-NEXT: s_waitcnt vmcnt(24)
; GFX950-NEXT: v_min_f64 v[62:63], v[6:7], v[48:49]
; GFX950-NEXT: v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49]
; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:128
; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:124
+; GFX950-NEXT: scratch_load_dword v31, off, s32
; GFX950-NEXT: s_waitcnt vmcnt(25)
; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[56:57]
; GFX950-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57]
@@ -2477,7 +2477,7 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX950-NEXT: v_cndmask_b32_e64 v20, v52, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v21, v53, v0, vcc
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[22:23], v[34:35]
-; GFX950-NEXT: s_waitcnt vmcnt(6)
+; GFX950-NEXT: s_waitcnt vmcnt(7)
; GFX950-NEXT: v_min_f64 v[34:35], v[24:25], v[32:33]
; GFX950-NEXT: v_accvgpr_read_b32 v60, a13 ; Reload Reuse
; GFX950-NEXT: v_cndmask_b32_e64 v22, v50, 0, vcc
@@ -2497,13 +2497,13 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX950-NEXT: v_accvgpr_read_b32 v42, a3 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v41, a2 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse
-; GFX950-NEXT: s_waitcnt vmcnt(4)
+; GFX950-NEXT: s_waitcnt vmcnt(5)
; GFX950-NEXT: v_min_f64 v[32:33], v[26:27], v[36:37]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[36:37]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e64 v26, v32, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v27, v33, v0, vcc
-; GFX950-NEXT: s_waitcnt vmcnt(2)
+; GFX950-NEXT: s_waitcnt vmcnt(3)
; GFX950-NEXT: v_min_f64 v[32:33], v[28:29], v[38:39]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[38:39]
; GFX950-NEXT: s_nop 1
@@ -2642,7 +2642,6 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16
@@ -2675,49 +2674,50 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:124
-; GFX11-NEXT: s_waitcnt vmcnt(30)
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_waitcnt vmcnt(31)
; GFX11-NEXT: v_min_f64 v[96:97], v[0:1], v[32:33]
; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[32:33]
-; GFX11-NEXT: s_waitcnt vmcnt(28)
+; GFX11-NEXT: s_waitcnt vmcnt(29)
; GFX11-NEXT: v_min_f64 v[32:33], v[2:3], v[34:35]
; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[34:35]
-; GFX11-NEXT: s_waitcnt vmcnt(26)
+; GFX11-NEXT: s_waitcnt vmcnt(27)
; GFX11-NEXT: v_min_f64 v[34:35], v[4:5], v[36:37]
; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[36:37]
-; GFX11-NEXT: s_waitcnt vmcnt(24)
+; GFX11-NEXT: s_waitcnt vmcnt(25)
; GFX11-NEXT: v_min_f64 v[36:37], v[6:7], v[38:39]
; GFX11-NEXT: v_cmp_u_f64_e64 s2, v[6:7], v[38:39]
-; GFX11-NEXT: s_waitcnt vmcnt(22)
+; GFX11-NEXT: s_waitcnt vmcnt(23)
; GFX11-NEXT: v_min_f64 v[38:39], v[8:9], v[48:49]
; GFX11-NEXT: v_cmp_u_f64_e64 s3, v[8:9], v[48:49]
-; GFX11-NEXT: s_waitcnt vmcnt(20)
+; GFX11-NEXT: s_waitcnt vmcnt(21)
; GFX11-NEXT: v_min_f64 v[48:49], v[10:11], v[50:51]
; GFX11-NEXT: v_cmp_u_f64_e64 s4, v[10:11], v[50:51]
-; GFX11-NEXT: s_waitcnt vmcnt(18)
+; GFX11-NEXT: s_waitcnt vmcnt(19)
; GFX11-NEXT: v_min_f64 v[50:51], v[12:13], v[52:53]
; GFX11-NEXT: v_cmp_u_f64_e64 s5, v[12:13], v[52:53]
-; GFX11-NEXT: s_waitcnt vmcnt(16)
+; GFX11-NEXT: s_waitcnt vmcnt(17)
; GFX11-NEXT: v_min_f64 v[52:53], v[14:15], v[54:55]
; GFX11-NEXT: v_cmp_u_f64_e64 s6, v[14:15], v[54:55]
-; GFX11-NEXT: s_waitcnt vmcnt(14)
+; GFX11-NEXT: s_waitcnt vmcnt(15)
; GFX11-NEXT: v_min_f64 v[54:55], v[16:17], v[64:65]
; GFX11-NEXT: v_cmp_u_f64_e64 s7, v[16:17], v[64:65]
-; GFX11-NEXT: s_waitcnt vmcnt(12)
+; GFX11-NEXT: s_waitcnt vmcnt(13)
; GFX11-NEXT: v_min_f64 v[64:65], v[18:19], v[66:67]
; GFX11-NEXT: v_cmp_u_f64_e64 s8, v[18:19], v[66:67]
-; GFX11-NEXT: s_waitcnt vmcnt(10)
+; GFX11-NEXT: s_waitcnt vmcnt(11)
; GFX11-NEXT: v_min_f64 v[66:67], v[20:21], v[68:69]
; GFX11-NEXT: v_cmp_u_f64_e64 s9, v[20:21], v[68:69]
-; GFX11-NEXT: s_waitcnt vmcnt(8)
+; GFX11-NEXT: s_waitcnt vmcnt(9)
; GFX11-NEXT: v_min_f64 v[68:69], v[22:23], v[70:71]
; GFX11-NEXT: v_cmp_u_f64_e64 s10, v[22:23], v[70:71]
-; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: s_waitcnt vmcnt(7)
; GFX11-NEXT: v_min_f64 v[70:71], v[24:25], v[80:81]
; GFX11-NEXT: v_cmp_u_f64_e64 s11, v[24:25], v[80:81]
-; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: v_min_f64 v[80:81], v[26:27], v[82:83]
; GFX11-NEXT: v_cmp_u_f64_e64 s12, v[26:27], v[82:83]
-; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: v_min_f64 v[82:83], v[28:29], v[84:85]
; GFX11-NEXT: v_cmp_u_f64_e64 s13, v[28:29], v[84:85]
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2765,7 +2765,6 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1f
-; GFX12-NEXT: scratch_load_b32 v31, off, s32
; GFX12-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX12-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX12-NEXT: scratch_load_b32 v35, off, s32 offset:16
@@ -2798,35 +2797,36 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX12-NEXT: scratch_load_b32 v84, off, s32 offset:116
; GFX12-NEXT: scratch_load_b32 v87, off, s32 offset:128
; GFX12-NEXT: scratch_load_b32 v86, off, s32 offset:124
-; GFX12-NEXT: s_wait_loadcnt 0x1e
+; GFX12-NEXT: scratch_load_b32 v31, off, s32
+; GFX12-NEXT: s_wait_loadcnt 0x1f
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[32:33]
-; GFX12-NEXT: s_wait_loadcnt 0x1c
+; GFX12-NEXT: s_wait_loadcnt 0x1d
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[34:35]
-; GFX12-NEXT: s_wait_loadcnt 0x1a
+; GFX12-NEXT: s_wait_loadcnt 0x1b
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[36:37]
-; GFX12-NEXT: s_wait_loadcnt 0x18
+; GFX12-NEXT: s_wait_loadcnt 0x19
; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[38:39]
-; GFX12-NEXT: s_wait_loadcnt 0x16
+; GFX12-NEXT: s_wait_loadcnt 0x17
; GFX12-NEXT: v_minimum_f64 v[8:9], v[8:9], v[48:49]
-; GFX12-NEXT: s_wait_loadcnt 0x14
+; GFX12-NEXT: s_wait_loadcnt 0x15
; GFX12-NEXT: v_minimum_f64 v[10:11], v[10:11], v[50:51]
-; GFX12-NEXT: s_wait_loadcnt 0x12
+; GFX12-NEXT: s_wait_loadcnt 0x13
; GFX12-NEXT: v_minimum_f64 v[12:13], v[12:13], v[52:53]
-; GFX12-NEXT: s_wait_loadcnt 0x10
+; GFX12-NEXT: s_wait_loadcnt 0x11
; GFX12-NEXT: v_minimum_f64 v[14:15], v[14:15], v[54:55]
-; GFX12-NEXT: s_wait_loadcnt 0xe
+; GFX12-NEXT: s_wait_loadcnt 0xf
; GFX12-NEXT: v_minimum_f64 v[16:17], v[16:17], v[64:65]
-; GFX12-NEXT: s_wait_loadcnt 0xc
+; GFX12-NEXT: s_wait_loadcnt 0xd
; GFX12-NEXT: v_minimum_f64 v[18:19], v[18:19], v[66:67]
-; GFX12-NEXT: s_wait_loadcnt 0xa
+; GFX12-NEXT: s_wait_loadcnt 0xb
; GFX12-NEXT: v_minimum_f64 v[20:21], v[20:21], v[68:69]
-; GFX12-NEXT: s_wait_loadcnt 0x8
+; GFX12-NEXT: s_wait_loadcnt 0x9
; GFX12-NEXT: v_minimum_f64 v[22:23], v[22:23], v[70:71]
-; GFX12-NEXT: s_wait_loadcnt 0x6
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: v_minimum_f64 v[24:25], v[24:25], v[80:81]
-; GFX12-NEXT: s_wait_loadcnt 0x4
+; GFX12-NEXT: s_wait_loadcnt 0x5
; GFX12-NEXT: v_minimum_f64 v[26:27], v[26:27], v[82:83]
-; GFX12-NEXT: s_wait_loadcnt 0x2
+; GFX12-NEXT: s_wait_loadcnt 0x3
; GFX12-NEXT: v_minimum_f64 v[28:29], v[28:29], v[84:85]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[30:31], v[30:31], v[86:87]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 7e8c30161c1c8..ba80a37c1a9d8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -490,44 +490,40 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
;
; VI-LABEL: minnum_v2f16_ieee:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s8, s[8:9], 0x0
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dword s6, s[2:3], 0x0
+; VI-NEXT: s_load_dword s4, s[4:5], 0x0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: v_max_f16_e64 v0, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s4, 16
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v2f16_ieee:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0
-; GFX9-NEXT: s_load_dword s11, s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s5, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s10, s10
-; GFX9-NEXT: v_pk_max_f16 v1, s11, s11
+; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT: v_pk_max_f16 v0, s5, s5
; GFX9-NEXT: v_pk_min_f16 v0, v1, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: minnum_v2f16_ieee:
@@ -536,6 +532,7 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-NEXT: s_load_dword s5, s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
@@ -553,6 +550,7 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[6:7], 0x0
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
@@ -817,52 +815,48 @@ define amdgpu_kernel void @minnum_v3f16(
;
; VI-LABEL: minnum_v3f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: v_max_f16_e64 v0, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s4, 16
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: v_max_f16_e64 v1, s9, s9
-; VI-NEXT: v_max_f16_e64 v2, s3, s3
+; VI-NEXT: v_max_f16_e64 v1, s5, s5
+; VI-NEXT: v_max_f16_e64 v2, s7, s7
; VI-NEXT: v_min_f16_e32 v1, v2, v1
-; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v3f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
-; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s10, s10
-; GFX9-NEXT: v_pk_max_f16 v1, s12, s12
-; GFX9-NEXT: v_pk_max_f16 v2, s11, s11
+; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT: v_pk_max_f16 v0, s8, s8
+; GFX9-NEXT: v_pk_max_f16 v2, s9, s9
; GFX9-NEXT: v_pk_min_f16 v0, v1, v0
-; GFX9-NEXT: v_pk_max_f16 v1, s13, s13
+; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
; GFX9-NEXT: v_pk_min_f16 v1, v1, v2
-; GFX9-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: minnum_v3f16:
@@ -871,6 +865,7 @@ define amdgpu_kernel void @minnum_v3f16(
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
@@ -892,6 +887,7 @@ define amdgpu_kernel void @minnum_v3f16(
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -960,56 +956,52 @@ define amdgpu_kernel void @minnum_v4f16(
;
; VI-LABEL: minnum_v4f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s9, s9
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
-; VI-NEXT: s_lshr_b32 s0, s9, 16
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: v_max_f16_e64 v0, s5, s5
+; VI-NEXT: s_lshr_b32 s5, s5, 16
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s5, s5
+; VI-NEXT: s_lshr_b32 s5, s7, 16
+; VI-NEXT: v_max_f16_e64 v2, s5, s5
; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v1, v0, v1
-; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v0, s4, s4
+; VI-NEXT: v_max_f16_e64 v2, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s4, 16
; VI-NEXT: v_min_f16_e32 v0, v2, v0
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v3, s4, s4
; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v4f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
-; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s11, s11
-; GFX9-NEXT: v_pk_max_f16 v1, s13, s13
-; GFX9-NEXT: v_pk_max_f16 v2, s10, s10
+; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT: v_pk_max_f16 v0, s9, s9
+; GFX9-NEXT: v_pk_max_f16 v2, s8, s8
; GFX9-NEXT: v_pk_min_f16 v1, v1, v0
-; GFX9-NEXT: v_pk_max_f16 v0, s12, s12
+; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
; GFX9-NEXT: v_pk_min_f16 v0, v0, v2
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: minnum_v4f16:
@@ -1018,6 +1010,7 @@ define amdgpu_kernel void @minnum_v4f16(
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
@@ -1038,6 +1031,7 @@ define amdgpu_kernel void @minnum_v4f16(
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
index 9e518589ac5b3..0a9a41b2010a6 100644
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -20,12 +20,12 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %p
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_add_u32 s0, s0, 4
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_load_dword v0, v[0:1]
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: flat_load_dword v1, v[1:2]
+; GCN-NEXT: flat_load_dword v1, v[2:3]
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index 1857eaba0a2a9..f4b03512c8915 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -114,13 +114,13 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v2
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; VI-NEXT: flat_load_dword v7, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v4, v[4:5]
-; VI-NEXT: flat_load_dword v5, v[0:1]
-; VI-NEXT: flat_load_dword v7, v[2:3]
-; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: flat_load_ushort v8, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6
@@ -128,11 +128,11 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_max_i16_e32 v6, v5, v7
-; VI-NEXT: v_max_i16_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v5, v6, v5
+; VI-NEXT: v_max_i16_e32 v6, v7, v5
+; VI-NEXT: v_max_i16_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_max_i16_e32 v4, v4, v8
+; VI-NEXT: v_max_i16_e32 v4, v8, v4
+; VI-NEXT: v_or_b32_e32 v5, v6, v5
; VI-NEXT: flat_store_short v[2:3], v4
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 05ffaf62ff1e0..c68bc647d1f91 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -2613,6 +2613,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX10-NEXT: s_load_dword s9, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2631,6 +2632,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
; GFX11-NEXT: s_load_b32 s5, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
index 0f67a404972aa..ab38e91af8bb4 100644
--- a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
@@ -10,11 +10,13 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh
; GFX11-NEXT: s_mov_b32 s0, s3
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s1, s5
+; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b256 s[20:27], s[2:3], 0x40
; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0
; GFX11-NEXT: s_load_b512 s[36:51], s[2:3], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0xbc00bc00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0
; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0
; GFX11-NEXT: image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
@@ -48,10 +50,12 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh
; GFX12-NEXT: v_mov_b32_e32 v0, 0xbc00bc00
; GFX12-NEXT: s_mov_b32 s3, s5
; GFX12-NEXT: s_mov_b32 s1, s5
+; GFX12-NEXT: s_clause 0x2
; GFX12-NEXT: s_load_b256 s[20:27], s[2:3], 0x40
; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x0
; GFX12-NEXT: s_load_b512 s[36:51], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_clause 0x4
; GFX12-NEXT: buffer_load_b32 v1, off, s[20:23], null
; GFX12-NEXT: buffer_load_b32 v2, off, s[16:19], null
; GFX12-NEXT: image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
@@ -85,11 +89,12 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0xbc00bc00
; GFX12-GISEL-NEXT: s_mov_b32 s1, s21
; GFX12-GISEL-NEXT: s_mov_b32 s3, s21
+; GFX12-GISEL-NEXT: s_clause 0x2
; GFX12-GISEL-NEXT: s_load_b512 s[4:19], s[0:1], 0x0
-; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b256 s[20:27], s[2:3], 0x40
; GFX12-GISEL-NEXT: s_load_b512 s[36:51], s[2:3], 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_clause 0x4
; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-GISEL-NEXT: buffer_load_b32 v2, off, s[16:19], null
; GFX12-GISEL-NEXT: buffer_load_b32 v3, off, s[20:23], null
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 0f47a31f52dcb..cf4813772530a 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -500,6 +500,7 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s12, s2
; GFX10-NEXT: s_mov_b32 s13, s3
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0
; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0
; GFX10-NEXT: s_mov_b32 s4, s0
@@ -523,6 +524,7 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s2
; GFX11-NEXT: s_mov_b32 s13, s3
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[4:7], 0
; GFX11-NEXT: s_mov_b32 s8, s0
@@ -546,6 +548,7 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s12, s2
; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null
; GFX12-NEXT: buffer_load_b32 v1, off, s[4:7], null
; GFX12-NEXT: s_mov_b32 s8, s0
@@ -1884,24 +1887,24 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_mov_b32 s15, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_lo_u32 v1, v2, v1
-; SI-NEXT: v_mul_hi_u32 v4, v2, v0
-; SI-NEXT: v_mul_lo_u32 v3, v3, v0
-; SI-NEXT: v_mul_lo_u32 v0, v2, v0
-; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v4
-; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; SI-NEXT: v_mul_lo_u32 v3, v0, v3
+; SI-NEXT: v_mul_hi_u32 v4, v0, v2
+; SI-NEXT: v_mul_lo_u32 v1, v1, v2
+; SI-NEXT: v_mul_lo_u32 v0, v0, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, v3, v4
+; SI-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -1911,24 +1914,24 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_mov_b32 s15, s7
-; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s10, s6
+; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mul_lo_u32 v4, v2, v1
-; VI-NEXT: v_mad_u64_u32 v[1:2], s[2:3], v2, v0, 0
-; VI-NEXT: v_mul_lo_u32 v0, v3, v0
-; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0
+; VI-NEXT: v_mul_lo_u32 v5, v0, v3
+; VI-NEXT: v_mad_u64_u32 v[3:4], s[2:3], v0, v2, 0
+; VI-NEXT: v_mul_lo_u32 v0, v1, v2
+; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4
+; VI-NEXT: v_add_u32_e32 v4, vcc, v1, v0
+; VI-NEXT: buffer_store_dwordx2 v[3:4], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul_i64:
@@ -1937,24 +1940,24 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_mov_b32 s14, s6
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s12, s2
; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s14, s6
; GFX9-NEXT: s_mov_b32 s15, s7
-; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
+; GFX9-NEXT: s_mov_b32 s10, s6
+; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1
-; GFX9-NEXT: v_mul_hi_u32 v4, v2, v0
-; GFX9-NEXT: v_mul_lo_u32 v3, v3, v0
-; GFX9-NEXT: v_mul_lo_u32 v0, v2, v0
-; GFX9-NEXT: v_add_u32_e32 v1, v4, v1
-; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT: v_mul_lo_u32 v3, v0, v3
+; GFX9-NEXT: v_mul_hi_u32 v4, v0, v2
+; GFX9-NEXT: v_mul_lo_u32 v1, v1, v2
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2
+; GFX9-NEXT: v_add_u32_e32 v2, v4, v3
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
@@ -1965,24 +1968,25 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
; GFX10-NEXT: s_mov_b32 s14, s6
; GFX10-NEXT: s_mov_b32 s15, s7
+; GFX10-NEXT: s_mov_b32 s10, s6
+; GFX10-NEXT: s_mov_b32 s11, s7
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s12, s2
; GFX10-NEXT: s_mov_b32 s13, s3
-; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; GFX10-NEXT: s_mov_b32 s4, s0
; GFX10-NEXT: s_mov_b32 s5, s1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1
-; GFX10-NEXT: v_mul_hi_u32 v4, v2, v0
-; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0
-; GFX10-NEXT: v_mul_lo_u32 v0, v2, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v4, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX10-NEXT: v_mul_lo_u32 v3, v0, v3
+; GFX10-NEXT: v_mul_hi_u32 v4, v0, v2
+; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2
+; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3
+; GFX10-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
@@ -1993,25 +1997,26 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, s10
-; GFX11-NEXT: s_mov_b32 s7, s11
; GFX11-NEXT: s_mov_b32 s14, s10
; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s6, s10
+; GFX11-NEXT: s_mov_b32 s7, s11
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s2
; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[12:15], 0
+; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[4:7], 0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mul_lo_u32 v1, v2, v1
-; GFX11-NEXT: v_mul_hi_u32 v4, v2, v0
-; GFX11-NEXT: v_mul_lo_u32 v3, v3, v0
-; GFX11-NEXT: v_mul_lo_u32 v0, v2, v0
+; GFX11-NEXT: v_mul_lo_u32 v3, v0, v3
+; GFX11-NEXT: v_mul_hi_u32 v4, v0, v2
+; GFX11-NEXT: v_mul_lo_u32 v1, v1, v2
+; GFX11-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v3, v4, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: s_endpgm
;
@@ -2022,24 +2027,25 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_mov_b32 s10, -1
; GFX12-NEXT: s_mov_b32 s11, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, s10
-; GFX12-NEXT: s_mov_b32 s7, s11
; GFX12-NEXT: s_mov_b32 s14, s10
; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s6, s10
+; GFX12-NEXT: s_mov_b32 s7, s11
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s12, s2
; GFX12-NEXT: s_mov_b32 s13, s3
-; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[12:15], null
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null
+; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[4:7], null
; GFX12-NEXT: s_mov_b32 s8, s0
; GFX12-NEXT: s_mov_b32 s9, s1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3
-; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2
-; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2
-; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
+; GFX12-NEXT: v_mul_lo_u32 v1, v2, v1
+; GFX12-NEXT: v_mul_lo_u32 v3, v3, v0
+; GFX12-NEXT: v_mul_hi_u32 v4, v2, v0
+; GFX12-NEXT: v_mul_lo_u32 v0, v2, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v1, v3, v1
+; GFX12-NEXT: v_add_nc_u32_e32 v1, v1, v3
; GFX12-NEXT: v_add_nc_u32_e32 v1, v1, v4
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index cc9650b9a7309..a40a766a6169a 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -734,20 +734,20 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
-; GFX6-NEXT: s_mov_b32 s10, s6
-; GFX6-NEXT: s_mov_b32 s11, s7
+; GFX6-NEXT: s_mov_b32 s14, s6
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s2
; GFX6-NEXT: s_mov_b32 s13, s3
-; GFX6-NEXT: s_mov_b32 s14, s6
; GFX6-NEXT: s_mov_b32 s15, s7
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX6-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
+; GFX6-NEXT: s_mov_b32 s10, s6
+; GFX6-NEXT: s_mov_b32 s11, s7
+; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; GFX6-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
@@ -757,20 +757,20 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_mov_b32 s14, s6
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s12, s2
; GFX8-NEXT: s_mov_b32 s13, s3
-; GFX8-NEXT: s_mov_b32 s14, s6
; GFX8-NEXT: s_mov_b32 s15, s7
-; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX8-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
+; GFX8-NEXT: s_mov_b32 s10, s6
+; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; GFX8-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; GFX8-NEXT: s_mov_b32 s4, s0
; GFX8-NEXT: s_mov_b32 s5, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 120aebf2bf7c8..b629551a81bbf 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -33,6 +33,7 @@ define hidden void @shuffle3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
; GFX10-LABEL: shuffle3744:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -116,6 +117,7 @@ define hidden void @shuffle1004(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
; GFX10-LABEL: shuffle1004:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -147,6 +149,7 @@ define hidden void @shuffle7533(ptr addrspace(0) %in0, ptr addrspace(0) %in1, pt
; GFX10-LABEL: shuffle7533:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: flat_load_dword v6, v[0:1]
; GFX10-NEXT: flat_load_dword v7, v[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -295,6 +298,7 @@ define hidden void @shuffle3546(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
; GFX10-LABEL: shuffle3546:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -564,6 +568,7 @@ define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %
; GFX10-LABEL: addUsesOr:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -613,6 +618,7 @@ define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %i
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -688,6 +694,7 @@ define hidden void @add(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, p
; GFX10-LABEL: add:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -744,6 +751,7 @@ define hidden void @add_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %el
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -794,6 +802,7 @@ define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %
; GFX10-LABEL: add_store:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v9, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -853,6 +862,7 @@ define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v9, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -921,6 +931,7 @@ define hidden void @add_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v9, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -991,6 +1002,7 @@ define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v9, v[0:1], off
; GFX10-NEXT: v_mov_b32_e32 v0, 2
@@ -1129,6 +1141,7 @@ define hidden void @bc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v9, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1178,6 +1191,7 @@ define hidden void @eve_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -1231,6 +1245,7 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v9
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v9, v[0:1], off
; GFX10-NEXT: global_load_dword v10, v[2:3], off
; GFX10-NEXT: v_mov_b32_e32 v0, 16
@@ -1298,6 +1313,7 @@ define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v9, v[2:3], off
; GFX10-NEXT: v_mov_b32_e32 v0, 26
@@ -1367,6 +1383,7 @@ define hidden void @mul_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v9, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -1437,6 +1454,7 @@ define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v9, v[0:1], off
; GFX10-NEXT: v_mov_b32_e32 v0, 16
@@ -1501,33 +1519,34 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: global_load_dword v4, v[2:3], off
-; GFX10-NEXT: global_load_dword v9, v[0:1], off
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v9, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v10, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
; GFX10-NEXT: v_rcp_iflag_f32_e32 v15, v1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v10
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v12
-; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v14
-; GFX10-NEXT: v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
-; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
+; GFX10-NEXT: v_xor_b32_sdwa v3, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v9), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0
; GFX10-NEXT: v_mul_f32_e32 v15, v2, v15
; GFX10-NEXT: v_mul_f32_e32 v16, v19, v16
-; GFX10-NEXT: v_ashrrev_i32_e32 v3, 30, v3
; GFX10-NEXT: v_mul_f32_e32 v17, v2, v17
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 30, v3
; GFX10-NEXT: v_or_b32_e32 v0, 1, v0
; GFX10-NEXT: v_trunc_f32_e32 v15, v15
; GFX10-NEXT: v_trunc_f32_e32 v16, v16
@@ -1562,7 +1581,7 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_nc_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x60706
+; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x60706
; GFX10-NEXT: global_store_dword v[5:6], v0, off
; GFX10-NEXT: global_store_dword v[7:8], v1, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1576,26 +1595,26 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_dword v4, v[2:3], off
-; GFX9-NEXT: global_load_dword v9, v[0:1], off
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v9, v[2:3], off
; GFX9-NEXT: s_mov_b32 s4, 0x60706
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4
-; GFX9-NEXT: v_xor_b32_sdwa v1, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX9-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v11, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX9-NEXT: v_xor_b32_sdwa v9, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX9-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; GFX9-NEXT: v_perm_b32 v0, v4, v9, s4
+; GFX9-NEXT: v_xor_b32_sdwa v1, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX9-NEXT: v_xor_b32_sdwa v10, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v11, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT: v_xor_b32_sdwa v4, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX9-NEXT: v_xor_b32_sdwa v14, sext(v9), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v9, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v12
; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v13
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v4
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v9
; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15
; GFX9-NEXT: v_mul_f32_e32 v16, v11, v16
; GFX9-NEXT: v_trunc_f32_e32 v15, v15
@@ -1610,23 +1629,23 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX9-NEXT: v_trunc_f32_e32 v18, v18
; GFX9-NEXT: v_mad_f32 v11, -v16, v12, v11
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v2|
-; GFX9-NEXT: v_ashrrev_i32_e32 v9, 30, v9
+; GFX9-NEXT: v_ashrrev_i32_e32 v4, 30, v4
; GFX9-NEXT: v_or_b32_e32 v10, 1, v10
; GFX9-NEXT: v_cvt_i32_f32_e32 v15, v15
; GFX9-NEXT: v_cvt_i32_f32_e32 v16, v16
; GFX9-NEXT: v_mad_f32 v3, -v17, v13, v3
; GFX9-NEXT: v_cvt_i32_f32_e32 v17, v17
-; GFX9-NEXT: v_mad_f32 v2, -v18, v4, v2
+; GFX9-NEXT: v_mad_f32 v2, -v18, v9, v2
; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v11|, |v12|
; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14
-; GFX9-NEXT: v_or_b32_e32 v9, 1, v9
+; GFX9-NEXT: v_or_b32_e32 v4, 1, v4
; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v13|
; GFX9-NEXT: v_or_b32_e32 v14, 1, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v9|
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v14, vcc
; GFX9-NEXT: v_add_u32_e32 v1, v15, v1
; GFX9-NEXT: v_add_u32_sdwa v4, v16, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -1662,6 +1681,7 @@ define hidden void @sext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v9, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -1726,6 +1746,7 @@ define hidden void @shl_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v9, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -1795,6 +1816,7 @@ define hidden void @sitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v9, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -1858,6 +1880,7 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v9, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -2037,6 +2060,7 @@ define hidden void @sub_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v2, v[2:3], off
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -2106,6 +2130,7 @@ define hidden void @sv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2153,6 +2178,7 @@ define hidden void @trunc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v9, v[2:3], off
; GFX10-NEXT: v_mov_b32_e32 v0, 1
@@ -2223,6 +2249,7 @@ define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt,
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v2, v[2:3], off
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -2280,45 +2307,45 @@ define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt,
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_dword v4, v[2:3], off
-; GFX9-NEXT: global_load_dword v9, v[0:1], off
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v9, v[2:3], off
; GFX9-NEXT: s_mov_b32 s4, 0x40207
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
+; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v1, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v9
; GFX9-NEXT: v_rcp_iflag_f32_e32 v11, v2
-; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v4
+; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v9
; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v1, v9
-; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v10, v4
+; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v10, v9
; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v10
; GFX9-NEXT: v_mul_f32_e32 v11, v1, v11
-; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4
-; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v4, v4
+; GFX9-NEXT: v_perm_b32 v0, v4, v9, s4
+; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v9, v9
; GFX9-NEXT: v_trunc_f32_e32 v11, v11
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v4
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v9
; GFX9-NEXT: v_mul_f32_e32 v12, v10, v12
; GFX9-NEXT: v_mad_f32 v1, -v11, v2, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v11
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v9, v9
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v4
; GFX9-NEXT: v_trunc_f32_e32 v12, v12
-; GFX9-NEXT: v_mul_f32_e32 v13, v9, v13
+; GFX9-NEXT: v_mul_f32_e32 v13, v4, v13
; GFX9-NEXT: v_mad_f32 v15, -v12, v3, v10
; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12
; GFX9-NEXT: v_trunc_f32_e32 v13, v13
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2
; GFX9-NEXT: v_mul_f32_e32 v14, v2, v14
-; GFX9-NEXT: v_mad_f32 v9, -v13, v10, v9
+; GFX9-NEXT: v_mad_f32 v4, -v13, v10, v4
; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v13
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v11, vcc
; GFX9-NEXT: v_trunc_f32_e32 v14, v14
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v15|, v3
-; GFX9-NEXT: v_mad_f32 v16, -v14, v4, v2
+; GFX9-NEXT: v_mad_f32 v16, -v14, v9, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v12, vcc
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v9|, v10
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v10
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v13, vcc
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v16|, v4
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v16|, v9
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v14, vcc
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
@@ -2352,6 +2379,7 @@ define hidden void @uitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v9, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -2411,6 +2439,7 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v2, v[2:3], off
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -2556,6 +2585,7 @@ define hidden void @xor_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v9, v[2:3], off
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffff00
@@ -2631,6 +2661,7 @@ define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v9, v[2:3], off
; GFX10-NEXT: v_mov_b32_e32 v0, 0xff
@@ -2724,6 +2755,7 @@ define hidden void @extract3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
; GFX10-LABEL: extract3744:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2768,6 +2800,7 @@ define hidden void @extract_perm_3744(ptr addrspace(1) %in0, ptr addrspace(1) %i
; GFX10-LABEL: extract_perm_3744:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2801,6 +2834,7 @@ define hidden void @extract1347_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %i
; GFX10-LABEL: extract1347_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2858,6 +2892,7 @@ define hidden void @fshri16_8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
; GFX10-LABEL: fshri16_8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2898,6 +2933,7 @@ define hidden void @fshri16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
; GFX10-LABEL: fshri16_16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2938,6 +2974,7 @@ define hidden void @fshri16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
; GFX10-LABEL: fshri16_24:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2978,6 +3015,7 @@ define hidden void @fshri16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
; GFX10-LABEL: fshri16_32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3018,6 +3056,7 @@ define hidden void @fshri16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
; GFX10-LABEL: fshri16_88:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3060,6 +3099,7 @@ define hidden void @fshli16_1347(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
; GFX10-LABEL: fshli16_1347:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3100,6 +3140,7 @@ define hidden void @fshli16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
; GFX10-LABEL: fshli16_16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3140,6 +3181,7 @@ define hidden void @fshli16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
; GFX10-LABEL: fshli16_24:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3180,6 +3222,7 @@ define hidden void @fshli16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
; GFX10-LABEL: fshli16_32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3220,6 +3263,7 @@ define hidden void @fshli16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
; GFX10-LABEL: fshli16_88:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3260,6 +3304,7 @@ define hidden void @shlbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr ad
; GFX10-LABEL: shlbase:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v7, v[0:1], off
; GFX10-NEXT: global_load_dword v8, v[2:3], off
; GFX10-NEXT: v_add_nc_u32_e32 v0, 16, v6
@@ -3321,6 +3366,7 @@ define hidden void @extractbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
; GFX10-LABEL: extractbase:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v7, v[0:1], off
; GFX10-NEXT: global_load_dword v8, v[2:3], off
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v6
@@ -3379,6 +3425,7 @@ define hidden void @extract_hilo(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
; GFX10-LABEL: extract_hilo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[2:3], off
; GFX10-NEXT: global_load_dword v7, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3421,6 +3468,7 @@ define hidden void @extract_lohi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
; GFX10-LABEL: extract_lohi:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:4
; GFX10-NEXT: global_load_dword v7, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3463,6 +3511,7 @@ define hidden void @extract_hihi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
; GFX10-LABEL: extract_hihi:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:4
; GFX10-NEXT: global_load_dword v7, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3584,6 +3633,7 @@ define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
; GFX10-LABEL: extract_3src:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT: global_load_dword v8, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index e452af7d60c0c..c206e1536aa68 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -102,14 +102,14 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) {
; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v31, v0
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_mov_b32_e32 v31, v0
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT: v_and_b32_e32 v18, 0xffff8000, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s35
@@ -172,6 +172,7 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) {
; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v31, v0
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
@@ -238,6 +239,7 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s32, 0
@@ -480,14 +482,14 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX900-NEXT: v_mov_b32_e32 v31, v0
-; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX900-NEXT: v_mov_b32_e32 v31, v0
; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX900-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 17, v0
; GFX900-NEXT: v_and_b32_e32 v6, 0xfe000000, v0
@@ -597,6 +599,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v31, v0
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
@@ -709,14 +712,14 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v31, v0
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX90A-NEXT: v_mov_b32_e32 v31, v0
; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_mov_b32 s32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX90A-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX90A-NEXT: v_and_b32_e32 v1, 0xff, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 17, v0
; GFX90A-NEXT: v_and_b32_e32 v0, 0xfe000000, v0
@@ -817,6 +820,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s32, 0
@@ -1126,14 +1130,14 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v31, v0
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_mov_b32_e32 v31, v0
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff8000, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s35
@@ -1183,6 +1187,7 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v31, v0
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
@@ -1243,6 +1248,7 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s32, 0
@@ -1409,14 +1415,14 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) {
; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v31, v0
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_mov_b32_e32 v31, v0
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff8000, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s35
@@ -1458,6 +1464,7 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) {
; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v31, v0
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
@@ -1501,6 +1508,7 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s32, 0
@@ -1631,14 +1639,14 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) {
; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v31, v0
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_mov_b32_e32 v31, v0
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff8000, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s35
@@ -1676,6 +1684,7 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) {
; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v31, v0
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
@@ -1714,6 +1723,7 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s32, 0
@@ -1812,13 +1822,13 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x2000, v0
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x2800, v12
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x3000, v12
; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v13, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11]
; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0x3800, v12
; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
@@ -1852,51 +1862,51 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v31, v0
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: v_mov_b32_e32 v31, v0
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 7, v0
-; GFX9-NEXT: v_and_b32_e32 v16, 0xffff8000, v0
+; GFX9-NEXT: v_and_b32_e32 v14, 0xffff8000, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s37
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s36, v16
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s36, v14
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v0, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s39
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, s38, v16
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v0, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v15, vcc, s38, v14
+; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v0, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x2000, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2048
-; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v10
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v11, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x3000, v10
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v11, vcc
-; GFX9-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048
-; GFX9-NEXT: global_load_dwordx2 v[12:13], v[2:3], off
-; GFX9-NEXT: global_load_dwordx2 v[14:15], v[2:3], off offset:2048
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0x2000, v15
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v16, vcc
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT: global_load_dwordx2 v[8:9], v[0:1], off offset:2048
+; GFX9-NEXT: global_load_dwordx2 v[10:11], v[2:3], off
+; GFX9-NEXT: global_load_dwordx2 v[12:13], v[4:5], off offset:2048
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v15
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v16, vcc
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2048
; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v7, vcc
; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v12, v10
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v13, v11, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v12
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v13, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v15, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: global_store_dwordx2 v16, v[0:1], s[36:37]
+; GFX9-NEXT: global_store_dwordx2 v14, v[0:1], s[36:37]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: DiffBase:
@@ -1911,6 +1921,7 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v31, v0
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
@@ -1939,6 +1950,7 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
; GFX10-NEXT: global_load_dwordx2 v[10:11], v[2:3], off
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x3800, v12
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v13, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[12:13], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[14:15], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(4)
@@ -1964,6 +1976,7 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_load_b128 s[36:39], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s32, 0
@@ -1987,8 +2000,8 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, 0x3000, v8
; GFX11-NEXT: global_load_b64 v[6:7], v[2:3], off offset:-4096
; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-NEXT: s_clause 0x4
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048
-; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[4:5], v[4:5], off offset:2048
; GFX11-NEXT: global_load_b64 v[10:11], v[8:9], off
; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
@@ -2142,14 +2155,14 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v31, v0
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_mov_b32_e32 v31, v0
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT: v_and_b32_e32 v22, 0xffff8000, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s35
@@ -2211,6 +2224,7 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v31, v0
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
@@ -2281,6 +2295,7 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s32, 0
@@ -2437,14 +2452,14 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v31, v0
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_mov_b32_e32 v31, v0
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff8000, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s35
@@ -2477,6 +2492,7 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v31, v0
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
@@ -2511,6 +2527,7 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s32, 0
diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
index d89e57245e8ea..af4475907823b 100644
--- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
+++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
@@ -3212,32 +3212,31 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) {
define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %i21, ptr addrspace(1) nocapture noundef writeonly align 4 %arg, i32 noundef %arg1) #1 {
; GFX67-LABEL: compute_mad:
; GFX67: ; %bb.0: ; %bb
-; GFX67-NEXT: s_load_dword s0, s[4:5], 0x6
+; GFX67-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
+; GFX67-NEXT: s_load_dword s2, s[4:5], 0x6
+; GFX67-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4
; GFX67-NEXT: s_mov_b32 s3, 0xf000
; GFX67-NEXT: s_waitcnt lgkmcnt(0)
-; GFX67-NEXT: s_add_i32 s0, s0, 1
-; GFX67-NEXT: v_mul_lo_u32 v1, s0, v0
-; GFX67-NEXT: v_add_i32_e32 v2, vcc, s0, v1
+; GFX67-NEXT: s_load_dwordx2 s[4:5], s[12:13], 0x0
+; GFX67-NEXT: s_load_dword s6, s[14:15], 0x1
+; GFX67-NEXT: s_add_i32 s2, s2, 1
+; GFX67-NEXT: v_mul_lo_u32 v1, s2, v0
+; GFX67-NEXT: v_add_i32_e32 v2, vcc, s2, v1
; GFX67-NEXT: v_mul_lo_u32 v2, v2, v0
; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
-; GFX67-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4
-; GFX67-NEXT: v_mul_lo_u32 v3, v2, v1
; GFX67-NEXT: s_waitcnt lgkmcnt(0)
-; GFX67-NEXT: s_load_dword s2, s[14:15], 0x1
-; GFX67-NEXT: s_load_dwordx2 s[4:5], s[12:13], 0x0
+; GFX67-NEXT: s_and_b32 s2, s6, 0xffff
+; GFX67-NEXT: v_mul_lo_u32 v3, v2, v1
+; GFX67-NEXT: s_mul_i32 s8, s8, s2
+; GFX67-NEXT: v_add_i32_e32 v0, vcc, s8, v0
; GFX67-NEXT: v_add_i32_e32 v1, vcc, v3, v1
; GFX67-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v3
-; GFX67-NEXT: s_waitcnt lgkmcnt(0)
-; GFX67-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX67-NEXT: s_mov_b32 s2, 0
; GFX67-NEXT: v_mul_lo_u32 v3, v1, v2
-; GFX67-NEXT: s_mul_i32 s8, s8, s2
-; GFX67-NEXT: v_add_i32_e32 v0, vcc, s8, v0
; GFX67-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX67-NEXT: v_mul_lo_u32 v1, v2, v1
; GFX67-NEXT: v_mov_b32_e32 v2, s5
-; GFX67-NEXT: s_mov_b32 s2, 0
; GFX67-NEXT: v_mul_lo_u32 v3, v1, v3
; GFX67-NEXT: v_add_i32_e32 v3, vcc, v3, v1
; GFX67-NEXT: v_mul_lo_u32 v4, v3, v1
@@ -3250,31 +3249,30 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %
;
; GFX8-LABEL: compute_mad:
; GFX8: ; %bb.0: ; %bb
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x18
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_add_i32 s0, s0, 1
-; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1
-; GFX8-NEXT: v_mul_lo_u32 v2, v2, v0
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v1
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s6, s[4:5], 0x18
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
-; GFX8-NEXT: v_mul_lo_u32 v3, v2, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x4
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v3
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX8-NEXT: v_mul_lo_u32 v3, v1, v2
+; GFX8-NEXT: s_load_dword s2, s[2:3], 0x4
+; GFX8-NEXT: s_add_i32 s6, s6, 1
+; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v1
+; GFX8-NEXT: v_mul_lo_u32 v2, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v1
; GFX8-NEXT: s_mul_i32 s8, s8, s2
+; GFX8-NEXT: v_mul_lo_u32 v3, v2, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v3
+; GFX8-NEXT: v_mul_lo_u32 v3, v1, v2
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_mul_lo_u32 v1, v2, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mul_lo_u32 v3, v1, v3
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
@@ -3289,29 +3287,28 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %
;
; GFX900-LABEL: compute_mad:
; GFX900: ; %bb.0: ; %bb
-; GFX900-NEXT: s_load_dword s0, s[4:5], 0x18
-; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: s_add_i32 s0, s0, 1
-; GFX900-NEXT: v_mul_lo_u32 v1, s0, v0
-; GFX900-NEXT: v_add_u32_e32 v2, s0, v1
-; GFX900-NEXT: v_mul_lo_u32 v2, v2, v0
-; GFX900-NEXT: v_add_u32_e32 v1, 1, v1
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: s_load_dword s9, s[4:5], 0x18
; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: s_load_dword s9, s[2:3], 0x4
; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX900-NEXT: v_mul_lo_u32 v3, v2, v1
+; GFX900-NEXT: s_load_dword s10, s[2:3], 0x4
+; GFX900-NEXT: s_add_i32 s9, s9, 1
+; GFX900-NEXT: v_mul_lo_u32 v1, s9, v0
; GFX900-NEXT: v_mov_b32_e32 v5, s7
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: s_and_b32 s0, s9, 0xffff
+; GFX900-NEXT: v_mov_b32_e32 v4, s5
+; GFX900-NEXT: s_and_b32 s0, s10, 0xffff
+; GFX900-NEXT: v_add_u32_e32 v2, s9, v1
+; GFX900-NEXT: v_mul_lo_u32 v2, v2, v0
+; GFX900-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX900-NEXT: s_mul_i32 s8, s8, s0
+; GFX900-NEXT: v_add_u32_e32 v0, s8, v0
+; GFX900-NEXT: v_mul_lo_u32 v3, v2, v1
; GFX900-NEXT: v_add_u32_e32 v1, v3, v1
; GFX900-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX900-NEXT: v_add_u32_e32 v2, 1, v3
-; GFX900-NEXT: s_mul_i32 s8, s8, s0
-; GFX900-NEXT: v_add_u32_e32 v0, s8, v0
; GFX900-NEXT: v_mul_lo_u32 v3, v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v4, s5
; GFX900-NEXT: v_add_u32_e32 v2, v3, v2
; GFX900-NEXT: v_mul_lo_u32 v1, v2, v1
; GFX900-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v3, v[1:2]
@@ -3326,11 +3323,13 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %
;
; GFX90A-LABEL: compute_mad:
; GFX90A: ; %bb.0: ; %bb
-; GFX90A-NEXT: s_load_dword s9, s[4:5], 0x18
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NEXT: s_load_dword s9, s[4:5], 0x18
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x4
; GFX90A-NEXT: s_add_i32 s9, s9, 1
; GFX90A-NEXT: v_mul_lo_u32 v0, s9, v4
; GFX90A-NEXT: v_add_u32_e32 v1, s9, v0
@@ -3341,18 +3340,16 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %
; GFX90A-NEXT: v_mul_lo_u32 v0, v0, v1
; GFX90A-NEXT: v_add_u32_e32 v1, 1, v2
; GFX90A-NEXT: v_mul_lo_u32 v2, v0, v1
-; GFX90A-NEXT: s_load_dword s4, s[2:3], 0x4
; GFX90A-NEXT: v_add_u32_e32 v1, v2, v1
; GFX90A-NEXT: v_mul_lo_u32 v0, v1, v0
-; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[2:3], v0, v2, v[0:1]
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, v0, v[2:3]
+; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v2, v[0:1]
+; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v0, v[2:3]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_and_b32 s2, s4, 0xffff
-; GFX90A-NEXT: s_mul_i32 s8, s8, s2
+; GFX90A-NEXT: s_and_b32 s0, s10, 0xffff
+; GFX90A-NEXT: s_mul_i32 s8, s8, s0
; GFX90A-NEXT: v_add_u32_e32 v1, s8, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, s1
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, s5
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s4, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX90A-NEXT: v_mov_b32_e32 v1, s7
@@ -3363,25 +3360,26 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %
;
; GFX10-LABEL: compute_mad:
; GFX10: ; %bb.0: ; %bb
-; GFX10-NEXT: s_load_dword s0, s[4:5], 0x18
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s0, s0, 1
-; GFX10-NEXT: v_mul_lo_u32 v1, s0, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v2, s0, v1
+; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; GFX10-NEXT: s_load_dword s6, s[4:5], 0x18
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
-; GFX10-NEXT: v_mul_lo_u32 v2, v2, v0
-; GFX10-NEXT: v_mul_lo_u32 v3, v2, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x4
+; GFX10-NEXT: s_add_i32 s6, s6, 1
+; GFX10-NEXT: v_mul_lo_u32 v1, s6, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v2, s6, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; GFX10-NEXT: v_mul_lo_u32 v2, v2, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX10-NEXT: v_mul_lo_u32 v3, v2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v1, v3, v1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-NEXT: v_mul_lo_u32 v2, v1, v2
; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v3
; GFX10-NEXT: v_mul_lo_u32 v4, v2, v1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v1
; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, s8, s2, v[0:1]
; GFX10-NEXT: v_mul_lo_u32 v1, v3, v2
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 0a746b0a3f572..9a5bf220d200c 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -344,13 +344,13 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 48, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_ushort v2, v[2:3]
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: flat_load_ushort v1, v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, v2, v0
-; GFX8-NEXT: v_sub_u16_e32 v2, 0, v2
-; GFX8-NEXT: v_lshrrev_b16_e32 v0, v2, v0
-; GFX8-NEXT: v_or_b32_e32 v2, v1, v0
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, v1, v0
+; GFX8-NEXT: v_sub_u16_e32 v1, 0, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v0, v1, v0
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT: flat_store_short v[0:1], v2
@@ -360,6 +360,7 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
; GFX10-LABEL: test_rotl_i16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ushort v6, v[2:3], off offset:48
; GFX10-NEXT: global_load_ushort v7, v[0:1], off offset:32
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -374,6 +375,7 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
; GFX11-TRUE16-LABEL: test_rotl_i16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off offset:48
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off offset:32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -389,6 +391,7 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
; GFX11-FAKE16-LABEL: test_rotl_i16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_u16 v2, v[2:3], off offset:48
; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off offset:32
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index d6e361d6e297e..4a13418405efb 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -301,13 +301,13 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 48, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_ushort v2, v[2:3]
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: flat_load_ushort v1, v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v0
-; GFX8-NEXT: v_sub_u16_e32 v2, 0, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0
-; GFX8-NEXT: v_or_b32_e32 v2, v1, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, v1, v0
+; GFX8-NEXT: v_sub_u16_e32 v1, 0, v1
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, v1, v0
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT: flat_store_short v[0:1], v2
@@ -317,6 +317,7 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
; GFX10-LABEL: test_rotr_i16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ushort v6, v[2:3], off offset:48
; GFX10-NEXT: global_load_ushort v7, v[0:1], off offset:32
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -331,6 +332,7 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
; GFX11-TRUE16-LABEL: test_rotr_i16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off offset:48
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off offset:32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -346,6 +348,7 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
; GFX11-FAKE16-LABEL: test_rotr_i16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_u16 v2, v[2:3], off offset:48
; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off offset:32
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll
index 8ad6a4e534d23..860223d4192ae 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll
@@ -7,15 +7,15 @@ define void @extracted_values(ptr %ret_struct, ptr addrspace(3) %arg0, ptr addrs
; CHECK-LABEL: extracted_values:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v3, v3
; CHECK-NEXT: ds_read_b32 v4, v4
-; CHECK-NEXT: ds_read_b32 v2, v2
; CHECK-NEXT: ds_read_b32 v5, v5
-; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: ds_read_b32 v3, v3
+; CHECK-NEXT: ds_read_b32 v2, v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: v_sub_f16_sdwa v6, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; CHECK-NEXT: v_sub_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_sub_f16_sdwa v7, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; CHECK-NEXT: v_sub_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; CHECK-NEXT: v_sub_f16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; CHECK-NEXT: v_add_f16_e32 v4, v6, v7
; CHECK-NEXT: v_add_f16_e32 v2, v3, v2
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index 21719226710de..b6d47267e23df 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -764,81 +764,81 @@ define amdgpu_kernel void @select_v2f16(
; SI-NEXT: s_mov_b32 s21, s13
; SI-NEXT: s_mov_b32 s22, s2
; SI-NEXT: s_mov_b32 s23, s3
-; SI-NEXT: s_mov_b32 s6, s2
-; SI-NEXT: s_mov_b32 s7, s3
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: s_mov_b32 s12, s14
; SI-NEXT: s_mov_b32 s13, s15
-; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
-; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0
-; SI-NEXT: buffer_load_dword v3, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SI-NEXT: buffer_load_dword v1, off, s[20:23], 0
+; SI-NEXT: s_mov_b32 s6, s2
+; SI-NEXT: s_mov_b32 s7, s3
+; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v3, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT: v_cmp_lt_f32_e32 vcc, v5, v6
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
-; SI-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2
+; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5
+; SI-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc
+; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_v2f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s22, s6
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44
+; VI-NEXT: s_mov_b32 s18, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s20, s12
-; VI-NEXT: s_mov_b32 s21, s13
-; VI-NEXT: s_mov_b32 s23, s7
; VI-NEXT: s_mov_b32 s16, s10
; VI-NEXT: s_mov_b32 s17, s11
-; VI-NEXT: s_mov_b32 s18, s6
; VI-NEXT: s_mov_b32 s19, s7
-; VI-NEXT: buffer_load_dword v0, off, s[20:23], 0
-; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
+; VI-NEXT: s_mov_b32 s20, s12
+; VI-NEXT: s_mov_b32 s21, s13
+; VI-NEXT: s_mov_b32 s22, s6
+; VI-NEXT: s_mov_b32 s23, s7
+; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0
; VI-NEXT: s_mov_b32 s12, s14
; VI-NEXT: s_mov_b32 s13, s15
; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_mov_b32 s15, s7
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], 0
-; VI-NEXT: buffer_load_dword v3, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s2, s6
+; VI-NEXT: s_mov_b32 s3, s7
+; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], 0
; VI-NEXT: s_mov_b32 s4, s8
; VI-NEXT: s_mov_b32 s5, s9
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v5, v4
-; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], v1, v0
+; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], v0, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cndmask_b32_e64 v0, v2, v3, s[0:1]
-; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[0:1]
+; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -850,39 +850,41 @@ define amdgpu_kernel void @select_v2f16(
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x44
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
-; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
-; GFX11-TRUE16-NEXT: s_mov_b32 s6, s2
-; GFX11-TRUE16-NEXT: s_mov_b32 s7, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s26, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s27, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_mov_b32 s20, s12
-; GFX11-TRUE16-NEXT: s_mov_b32 s21, s13
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s11
-; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[20:23], 0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s20, s12
+; GFX11-TRUE16-NEXT: s_mov_b32 s21, s13
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[20:23], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s24, s14
; GFX11-TRUE16-NEXT: s_mov_b32 s25, s15
-; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[4:7], 0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v3, off, s[24:27], 0
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[24:27], 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v3, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s1, s9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v5.l, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.l, v2.l, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s8
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -897,40 +899,42 @@ define amdgpu_kernel void @select_v2f16(
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x44
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s6, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s7, s3
-; GFX11-FAKE16-NEXT: s_mov_b32 s22, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s23, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s18, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s19, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s22, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s23, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s26, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s27, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s20, s12
-; GFX11-FAKE16-NEXT: s_mov_b32 s21, s13
; GFX11-FAKE16-NEXT: s_mov_b32 s16, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s17, s11
+; GFX11-FAKE16-NEXT: s_mov_b32 s20, s12
+; GFX11-FAKE16-NEXT: s_mov_b32 s21, s13
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[20:23], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s24, s14
; GFX11-FAKE16-NEXT: s_mov_b32 s25, s15
-; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[20:23], 0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[16:19], 0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v3, off, s[24:27], 0
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[24:27], 0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v3, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s0, s8
; GFX11-FAKE16-NEXT: s_mov_b32 s1, s9
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v2, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v6, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v5, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v6 :: v_dual_and_b32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_endpgm
@@ -1052,6 +1056,7 @@ define amdgpu_kernel void @select_v2f16_imm_a(
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9
; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5
@@ -1091,6 +1096,7 @@ define amdgpu_kernel void @select_v2f16_imm_a(
; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7
+; GFX11-FAKE16-NEXT: s_clause 0x2
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
@@ -1228,6 +1234,7 @@ define amdgpu_kernel void @select_v2f16_imm_b(
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9
; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5
@@ -1267,6 +1274,7 @@ define amdgpu_kernel void @select_v2f16_imm_b(
; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7
+; GFX11-FAKE16-NEXT: s_clause 0x2
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
@@ -1355,34 +1363,34 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s5
+; VI-NEXT: s_mov_b32 s18, s10
+; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
-; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v0
+; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
+; VI-NEXT: v_mov_b32_e32 v1, 0x3900
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v5, v4
-; VI-NEXT: v_mov_b32_e32 v1, 0x3900
; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
@@ -1393,28 +1401,29 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; GFX11-TRUE16-NEXT: s_load_b256 s[4:11], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2
-; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8
-; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7
-; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8
+; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11
; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1446,6 +1455,7 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7
+; GFX11-FAKE16-NEXT: s_clause 0x2
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
@@ -1534,34 +1544,34 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s5
+; VI-NEXT: s_mov_b32 s18, s10
+; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
-; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; VI-NEXT: v_cmp_lt_f16_e32 vcc, v1, v0
+; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
+; VI-NEXT: v_mov_b32_e32 v1, 0x3900
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v5, v4
-; VI-NEXT: v_mov_b32_e32 v1, 0x3900
; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
@@ -1572,28 +1582,29 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; GFX11-TRUE16-NEXT: s_load_b256 s[4:11], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2
-; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8
-; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7
-; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8
+; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11
; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1625,6 +1636,7 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7
+; GFX11-FAKE16-NEXT: s_clause 0x2
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
@@ -3496,22 +3508,22 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32>
; GFX11-FAKE16-NEXT: scratch_load_b32 v52, off, s32 offset:24
; GFX11-FAKE16-NEXT: scratch_load_b32 v53, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_b32 v54, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_b32 v64, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_b32 v66, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_b32 v67, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_b32 v68, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_b32 v69, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_b32 v70, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_b32 v71, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_b32 v80, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_b32 v81, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_b32 v82, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_b32 v83, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_b32 v64, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_b32 v66, off, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_load_b32 v67, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_b32 v68, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_b32 v69, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_b32 v70, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_b32 v71, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_b32 v80, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_b32 v81, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_b32 v82, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_b32 v83, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v84, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_b32 v85, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v86, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_b32 v85, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_b32 v86, off, s32 offset:28
; GFX11-FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:128
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v30
@@ -3590,69 +3602,57 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32>
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v54
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v98, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v55
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v66
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v33, v15, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v64
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v65
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v64
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_perm_b32 v14, v97, v14, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v66
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v55
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v67
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v70
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_perm_b32 v12, v34, v12, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v68
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v69
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v68
; GFX11-FAKE16-NEXT: v_perm_b32 v13, v99, v13, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_perm_b32 v10, v36, v10, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v70
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v67
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v71
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v82
; GFX11-FAKE16-NEXT: v_perm_b32 v11, v35, v11, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_perm_b32 v8, v38, v8, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v80
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v81
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v80
; GFX11-FAKE16-NEXT: v_perm_b32 v9, v37, v9, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v82
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v71
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v83
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v86
; GFX11-FAKE16-NEXT: v_perm_b32 v7, v39, v7, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_perm_b32 v4, v50, v4, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v84
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v85
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v83
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v86
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v85
; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v32, v0, 0x5040100
diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
index 0b68a0534fa08..fbf4ccfe82d6a 100644
--- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -382,20 +382,20 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s6, s2
-; SI-NEXT: s_mov_b32 s7, s3
+; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s10
; SI-NEXT: s_mov_b32 s13, s11
-; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
-; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s6, s2
+; SI-NEXT: s_mov_b32 s7, s3
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0
+; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1
-; SI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
+; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 1.0, v1
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1]
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_mov_b32 s0, s8
@@ -409,20 +409,20 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s6, s2
-; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s10
; VI-NEXT: s_mov_b32 s13, s11
-; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_mov_b32 s15, s3
-; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0
+; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1
-; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
+; VI-NEXT: v_cmp_le_f32_e64 s[0:1], 1.0, v1
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1]
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_mov_b32 s0, s8
@@ -437,22 +437,23 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s7
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s11
-; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s8
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
+; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
-; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 1.0, v1
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
@@ -466,22 +467,23 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6
-; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7
; GFX11-FAKE16-NEXT: s_mov_b32 s14, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s15, s7
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s11
-; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s8
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s9
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
+; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
-; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 1.0, v1
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index a3aeea8a145cd..474a69845e635 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -710,19 +710,19 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac
; GFX6-NEXT: s_mov_b32 s11, 0xf000
; GFX6-NEXT: s_mov_b32 s14, 0
; GFX6-NEXT: s_mov_b32 s15, s11
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[12:13], s[2:3]
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_mov_b64 s[6:7], s[14:15]
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mov_b64 s[12:13], s[2:3]
-; GFX6-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[12:15], 0 addr64
+; GFX6-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[12:15], 0 addr64
+; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: s_mov_b32 s8, s0
; GFX6-NEXT: s_mov_b32 s9, s1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; GFX6-NEXT: s_endpgm
;
@@ -800,21 +800,21 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace
; GFX6-NEXT: s_mov_b32 s11, 0xf000
; GFX6-NEXT: s_mov_b32 s14, 0
; GFX6-NEXT: s_mov_b32 s15, s11
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[12:13], s[2:3]
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX6-NEXT: v_mov_b32_e32 v5, 0
; GFX6-NEXT: s_mov_b64 s[6:7], s[14:15]
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mov_b64 s[12:13], s[2:3]
-; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
-; GFX6-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[12:15], 0 addr64
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[12:15], 0 addr64
+; GFX6-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: s_mov_b32 s8, s0
; GFX6-NEXT: s_mov_b32 s9, s1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v6, v2
-; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
-; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; GFX6-NEXT: s_endpgm
;
@@ -935,14 +935,14 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v0
-; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v0
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
-; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[12:13]
+; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[10:11]
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 16, v8
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, 16, v12
-; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, 16, v10
+; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v11, vcc
; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GFX8-NEXT: v_mov_b32_e32 v17, s1
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 42bd2ff8797a1..1997e2bb10678 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -97,19 +97,17 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 {
; GFX9-LABEL: s_test_sub_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0
-; GFX9-NEXT: s_load_dword s11, s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s5, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s10
-; GFX9-NEXT: v_pk_sub_i16 v0, s11, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_pk_sub_i16 v0, s4, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_test_sub_v2i16:
@@ -139,6 +137,7 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
@@ -154,6 +153,7 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
index eb1b844ad8938..4f622ecb7925d 100644
--- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -382,20 +382,20 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s6, s2
-; SI-NEXT: s_mov_b32 s7, s3
+; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s10
; SI-NEXT: s_mov_b32 s13, s11
-; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
-; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s6, s2
+; SI-NEXT: s_mov_b32 s7, s3
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0
+; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1
-; SI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
+; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 1.0, v1
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_mov_b32 s0, s8
@@ -409,20 +409,20 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s6, s2
-; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s10
; VI-NEXT: s_mov_b32 s13, s11
-; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_mov_b32 s15, s3
-; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0
+; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1
-; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
+; VI-NEXT: v_cmp_le_f32_e64 s[0:1], 1.0, v1
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_mov_b32 s0, s8
@@ -437,22 +437,23 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s7
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s11
-; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s8
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
+; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
-; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 1.0, v1
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
@@ -466,22 +467,23 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6
-; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7
; GFX11-FAKE16-NEXT: s_mov_b32 s14, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s15, s7
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s11
-; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s8
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s9
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
+; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
-; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 1.0, v1
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
index 580938f922a04..fd5eb5d80495a 100644
--- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -67,6 +67,7 @@ define amdgpu_kernel void @madak_f16(
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0
; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
@@ -92,6 +93,7 @@ define amdgpu_kernel void @madak_f16(
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0
; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
@@ -223,6 +225,7 @@ define amdgpu_kernel void @madak_f16_use_2(
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 0x4900, v0.h
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0x4900, v0.l
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -260,6 +263,7 @@ define amdgpu_kernel void @madak_f16_use_2(
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 0x4900, v1
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 0x4900, v0
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll
index bce7c1e5e8ab7..51fc72be41f36 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll
@@ -2080,8 +2080,8 @@ define double @test_vector_reduce_fadd_v16double(double %sp, <16 x double> %v) {
; GFX9-SDAG-LABEL: test_vector_reduce_fadd_v16double:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: scratch_load_dword v33, off, s32 offset:8
; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32
+; GFX9-SDAG-NEXT: scratch_load_dword v33, off, s32 offset:8
; GFX9-SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
@@ -2097,7 +2097,7 @@ define double @test_vector_reduce_fadd_v16double(double %sp, <16 x double> %v) {
; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[24:25]
; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[26:27]
; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[28:29]
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(2)
; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[30:31]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[32:33]
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll
index 657fe0f0804f3..3b8c3de3e5433 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll
@@ -2080,8 +2080,8 @@ define double @test_vector_reduce_fmul_v16double(double %sp, <16 x double> %v) {
; GFX9-SDAG-LABEL: test_vector_reduce_fmul_v16double:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: scratch_load_dword v33, off, s32 offset:8
; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32
+; GFX9-SDAG-NEXT: scratch_load_dword v33, off, s32 offset:8
; GFX9-SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
@@ -2097,7 +2097,7 @@ define double @test_vector_reduce_fmul_v16double(double %sp, <16 x double> %v) {
; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[24:25]
; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[26:27]
; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[28:29]
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(2)
; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[30:31]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[32:33]
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 58602a1ccd5ba..10bf1358b2aa8 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -58,6 +58,7 @@ define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_234u:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -69,6 +70,7 @@ define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-LABEL: shuffle_v4f16_234u:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -198,6 +200,7 @@ define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_3u6u:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -209,6 +212,7 @@ define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_3u6u:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -219,6 +223,7 @@ define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-FAKE16-LABEL: shuffle_v4f16_3u6u:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -257,6 +262,7 @@ define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_3uu7:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -268,6 +274,7 @@ define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_3uu7:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -278,6 +285,7 @@ define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-FAKE16-LABEL: shuffle_v4f16_3uu7:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -316,6 +324,7 @@ define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_35u5:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -326,6 +335,7 @@ define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_35u5:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -338,6 +348,7 @@ define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-FAKE16-LABEL: shuffle_v4f16_35u5:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -377,6 +388,7 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_357u:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -388,6 +400,7 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_357u:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -401,6 +414,7 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-FAKE16-LABEL: shuffle_v4f16_357u:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -486,6 +500,7 @@ define <4 x half> @shuffle_v4f16_0145(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_0145:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -497,6 +512,7 @@ define <4 x half> @shuffle_v4f16_0145(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-LABEL: shuffle_v4f16_0145:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -522,6 +538,7 @@ define <4 x half> @shuffle_v4f16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_0167:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -533,6 +550,7 @@ define <4 x half> @shuffle_v4f16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-LABEL: shuffle_v4f16_0167:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -627,6 +645,7 @@ define <4 x half> @shuffle_v4f16_2345(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_2345:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -638,6 +657,7 @@ define <4 x half> @shuffle_v4f16_2345(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-LABEL: shuffle_v4f16_2345:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -663,6 +683,7 @@ define <4 x half> @shuffle_v4f16_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_2367:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -674,6 +695,7 @@ define <4 x half> @shuffle_v4f16_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-LABEL: shuffle_v4f16_2367:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -699,6 +721,7 @@ define <4 x half> @shuffle_v4f16_4501(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_4501:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v5, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -710,6 +733,7 @@ define <4 x half> @shuffle_v4f16_4501(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-LABEL: shuffle_v4f16_4501:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v2, v[2:3], off
; GFX11-NEXT: global_load_b32 v1, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -737,6 +761,7 @@ define <4 x half> @shuffle_v4f16_4523(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_4523:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -748,6 +773,7 @@ define <4 x half> @shuffle_v4f16_4523(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-LABEL: shuffle_v4f16_4523:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v2, v[2:3], off
; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -832,6 +858,7 @@ define <4 x half> @shuffle_v4f16_6701(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_6701:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT: global_load_dword v5, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -843,6 +870,7 @@ define <4 x half> @shuffle_v4f16_6701(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-LABEL: shuffle_v4f16_6701:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4
; GFX11-NEXT: global_load_b32 v1, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -870,6 +898,7 @@ define <4 x half> @shuffle_v4f16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_6723:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -881,6 +910,7 @@ define <4 x half> @shuffle_v4f16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-LABEL: shuffle_v4f16_6723:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4
; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -988,6 +1018,7 @@ define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_2356:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -999,6 +1030,7 @@ define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_2356:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1010,6 +1042,7 @@ define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-FAKE16-LABEL: shuffle_v4f16_2356:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -1048,6 +1081,7 @@ define <4 x half> @shuffle_v4f16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_5623:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -1059,6 +1093,7 @@ define <4 x half> @shuffle_v4f16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_5623:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1070,6 +1105,7 @@ define <4 x half> @shuffle_v4f16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-FAKE16-LABEL: shuffle_v4f16_5623:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -1097,6 +1133,7 @@ define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_3456:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -1108,6 +1145,7 @@ define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_3456:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1121,6 +1159,7 @@ define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-FAKE16-LABEL: shuffle_v4f16_3456:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -1149,6 +1188,7 @@ define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_5634:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -1160,6 +1200,7 @@ define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_5634:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1173,6 +1214,7 @@ define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-FAKE16-LABEL: shuffle_v4f16_5634:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -1214,6 +1256,7 @@ define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_5734:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -1225,6 +1268,7 @@ define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_5734:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1237,6 +1281,7 @@ define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-FAKE16-LABEL: shuffle_v4f16_5734:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -1276,6 +1321,7 @@ define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %a
; GFX10-LABEL: shuffle_v4i16_2356:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -1287,6 +1333,7 @@ define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %a
; GFX11-TRUE16-LABEL: shuffle_v4i16_2356:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1299,6 +1346,7 @@ define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %a
; GFX11-FAKE16-LABEL: shuffle_v4i16_2356:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -1326,6 +1374,7 @@ define <4 x i16> @shuffle_v4i16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %a
; GFX10-LABEL: shuffle_v4i16_0167:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -1337,6 +1386,7 @@ define <4 x i16> @shuffle_v4i16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %a
; GFX11-LABEL: shuffle_v4i16_0167:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1528,6 +1578,7 @@ define <4 x half> @shuffle_v4f16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_6161:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1538,6 +1589,7 @@ define <4 x half> @shuffle_v4f16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_6161:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off offset:4
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1551,6 +1603,7 @@ define <4 x half> @shuffle_v4f16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-FAKE16-LABEL: shuffle_v4f16_6161:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -1736,6 +1789,7 @@ define <4 x half> @shuffle_v8f16_4589(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v8f16_4589:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:8
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -1747,6 +1801,7 @@ define <4 x half> @shuffle_v8f16_4589(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-LABEL: shuffle_v8f16_4589:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:8
; GFX11-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1772,6 +1827,7 @@ define <4 x half> @shuffle_v8f16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrspace
; GFX10-LABEL: shuffle_v8f16_10_11_2_3:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -1783,6 +1839,7 @@ define <4 x half> @shuffle_v8f16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrspace
; GFX11-LABEL: shuffle_v8f16_10_11_2_3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4
; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -1821,6 +1878,7 @@ define <4 x half> @shuffle_v8f16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace
; GFX10-LABEL: shuffle_v8f16_13_14_2_3:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -1832,6 +1890,7 @@ define <4 x half> @shuffle_v8f16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace
; GFX11-TRUE16-LABEL: shuffle_v8f16_13_14_2_3:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1843,6 +1902,7 @@ define <4 x half> @shuffle_v8f16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace
; GFX11-FAKE16-LABEL: shuffle_v8f16_13_14_2_3:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -1980,6 +2040,7 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off
; GFX10-NEXT: global_load_dword v7, v[3:4], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -1992,6 +2053,7 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX11-NEXT: global_load_b32 v3, v[3:4], off
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -2138,6 +2200,7 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_0456:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1
@@ -2150,6 +2213,7 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_0456:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -2161,6 +2225,7 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-FAKE16-LABEL: shuffle_v4f16_0456:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b64 v[1:2], v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -2255,6 +2320,7 @@ define <2 x half> @low16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX10-LABEL: low16bits_v2f16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2264,6 +2330,7 @@ define <2 x half> @low16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX11-TRUE16-LABEL: low16bits_v2f16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -2273,6 +2340,7 @@ define <2 x half> @low16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX11-FAKE16-LABEL: low16bits_v2f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -2310,6 +2378,7 @@ define <2 x half> @hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX10-LABEL: hi16bits_v2f16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2319,6 +2388,7 @@ define <2 x half> @hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX11-TRUE16-LABEL: hi16bits_v2f16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -2331,6 +2401,7 @@ define <2 x half> @hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX11-FAKE16-LABEL: hi16bits_v2f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -2368,6 +2439,7 @@ define <2 x half> @low16hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x
; GFX10-LABEL: low16hi16bits_v2f16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2377,6 +2449,7 @@ define <2 x half> @low16hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x
; GFX11-LABEL: low16hi16bits_v2f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2403,6 +2476,7 @@ define <2 x half> @hi16low16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %
; GFX10-LABEL: hi16low16bits_v2bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2412,6 +2486,7 @@ define <2 x half> @hi16low16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: hi16low16bits_v2bf16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -2423,6 +2498,7 @@ define <2 x half> @hi16low16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %
; GFX11-FAKE16-LABEL: hi16low16bits_v2bf16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -2460,6 +2536,7 @@ define <2 x i16> @i16_low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX10-LABEL: i16_low16bits:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2469,6 +2546,7 @@ define <2 x i16> @i16_low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX11-TRUE16-LABEL: i16_low16bits:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -2478,6 +2556,7 @@ define <2 x i16> @i16_low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX11-FAKE16-LABEL: i16_low16bits:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -2515,6 +2594,7 @@ define <2 x i16> @i16_low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
; GFX10-LABEL: i16_low16hi16bits:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2524,6 +2604,7 @@ define <2 x i16> @i16_low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
; GFX11-LABEL: i16_low16hi16bits:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2550,6 +2631,7 @@ define <2 x i16> @i16_hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
; GFX10-LABEL: i16_hi16low16bits:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2559,6 +2641,7 @@ define <2 x i16> @i16_hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
; GFX11-TRUE16-LABEL: i16_hi16low16bits:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -2571,6 +2654,7 @@ define <2 x i16> @i16_hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
; GFX11-FAKE16-LABEL: i16_hi16low16bits:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -2608,6 +2692,7 @@ define <2 x i16> @i16_hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX10-LABEL: i16_hi16bits:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2617,6 +2702,7 @@ define <2 x i16> @i16_hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX11-TRUE16-LABEL: i16_hi16bits:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -2630,6 +2716,7 @@ define <2 x i16> @i16_hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX11-FAKE16-LABEL: i16_hi16bits:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -2721,6 +2808,7 @@ define void @shuffle_v8f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
; GFX10-LABEL: shuffle_v8f16_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2730,6 +2818,7 @@ define void @shuffle_v8f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
; GFX11-LABEL: shuffle_v8f16_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2758,6 +2847,7 @@ define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
; GFX10-LABEL: shuffle_v16f16_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -2769,6 +2859,7 @@ define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
; GFX11-LABEL: shuffle_v16f16_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -2805,10 +2896,9 @@ define void @shuffle_v32f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
; GFX10-LABEL: shuffle_v32f16_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
-; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
; GFX10-NEXT: s_waitcnt vmcnt(3)
@@ -2824,10 +2914,9 @@ define void @shuffle_v32f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
; GFX11-LABEL: shuffle_v32f16_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
; GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16
-; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(3)
@@ -2860,6 +2949,7 @@ define void @shuffle_v8i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
; GFX10-LABEL: shuffle_v8i16_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2869,6 +2959,7 @@ define void @shuffle_v8i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
; GFX11-LABEL: shuffle_v8i16_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2897,6 +2988,7 @@ define void @shuffle_v16i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
; GFX10-LABEL: shuffle_v16i16_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -2908,6 +3000,7 @@ define void @shuffle_v16i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
; GFX11-LABEL: shuffle_v16i16_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -2944,10 +3037,9 @@ define void @shuffle_v32i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
; GFX10-LABEL: shuffle_v32i16_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
-; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
; GFX10-NEXT: s_waitcnt vmcnt(3)
@@ -2963,10 +3055,9 @@ define void @shuffle_v32i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
; GFX11-LABEL: shuffle_v32i16_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
; GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16
-; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(3)
@@ -3012,6 +3103,7 @@ define void @shuffle_v4i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1,
; GFX10-LABEL: shuffle_v4i8_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3030,6 +3122,7 @@ define void @shuffle_v4i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1,
; GFX11-FAKE16-LABEL: shuffle_v4i8_concat:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -3056,6 +3149,7 @@ define void @shuffle_v8i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1,
; GFX10-LABEL: shuffle_v8i8_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3065,6 +3159,7 @@ define void @shuffle_v8i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1,
; GFX11-LABEL: shuffle_v8i8_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3091,6 +3186,7 @@ define void @shuffle_v16i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
; GFX10-LABEL: shuffle_v16i8_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3100,6 +3196,7 @@ define void @shuffle_v16i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
; GFX11-LABEL: shuffle_v16i8_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3128,6 +3225,7 @@ define void @shuffle_v32i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
; GFX10-LABEL: shuffle_v32i8_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -3139,6 +3237,7 @@ define void @shuffle_v32i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
; GFX11-LABEL: shuffle_v32i8_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -3167,6 +3266,7 @@ define void @shuffle_v4i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
; GFX10-LABEL: shuffle_v4i32_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3176,6 +3276,7 @@ define void @shuffle_v4i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
; GFX11-LABEL: shuffle_v4i32_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3204,6 +3305,7 @@ define void @shuffle_v8i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
; GFX10-LABEL: shuffle_v8i32_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -3215,6 +3317,7 @@ define void @shuffle_v8i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
; GFX11-LABEL: shuffle_v8i32_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -3251,10 +3354,9 @@ define void @shuffle_v16i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
; GFX10-LABEL: shuffle_v16i32_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
-; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
; GFX10-NEXT: s_waitcnt vmcnt(3)
@@ -3270,10 +3372,9 @@ define void @shuffle_v16i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
; GFX11-LABEL: shuffle_v16i32_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
; GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16
-; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(3)
@@ -3345,6 +3446,7 @@ define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_234u:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -3356,6 +3458,7 @@ define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-LABEL: shuffle_v4bf16_234u:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3485,6 +3588,7 @@ define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_3u6u:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -3496,6 +3600,7 @@ define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_3u6u:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -3506,6 +3611,7 @@ define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-FAKE16-LABEL: shuffle_v4bf16_3u6u:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -3544,6 +3650,7 @@ define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_3uu7:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -3555,6 +3662,7 @@ define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_3uu7:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -3565,6 +3673,7 @@ define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-FAKE16-LABEL: shuffle_v4bf16_3uu7:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -3603,6 +3712,7 @@ define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_35u5:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3613,6 +3723,7 @@ define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_35u5:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -3625,6 +3736,7 @@ define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-FAKE16-LABEL: shuffle_v4bf16_35u5:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -3664,6 +3776,7 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_357u:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -3675,6 +3788,7 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_357u:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -3688,6 +3802,7 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-FAKE16-LABEL: shuffle_v4bf16_357u:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -3773,6 +3888,7 @@ define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_0145:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -3784,6 +3900,7 @@ define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-LABEL: shuffle_v4bf16_0145:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3809,6 +3926,7 @@ define <4 x bfloat> @shuffle_v4bf16_0167(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_0167:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -3820,6 +3938,7 @@ define <4 x bfloat> @shuffle_v4bf16_0167(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-LABEL: shuffle_v4bf16_0167:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3914,6 +4033,7 @@ define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_2345:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -3925,6 +4045,7 @@ define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-LABEL: shuffle_v4bf16_2345:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3950,6 +4071,7 @@ define <4 x bfloat> @shuffle_v4bf16_2367(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_2367:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -3961,6 +4083,7 @@ define <4 x bfloat> @shuffle_v4bf16_2367(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-LABEL: shuffle_v4bf16_2367:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3986,6 +4109,7 @@ define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_4501:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v5, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -3997,6 +4121,7 @@ define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-LABEL: shuffle_v4bf16_4501:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v2, v[2:3], off
; GFX11-NEXT: global_load_b32 v1, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -4024,6 +4149,7 @@ define <4 x bfloat> @shuffle_v4bf16_4523(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_4523:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -4035,6 +4161,7 @@ define <4 x bfloat> @shuffle_v4bf16_4523(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-LABEL: shuffle_v4bf16_4523:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v2, v[2:3], off
; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -4119,6 +4246,7 @@ define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_6701:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT: global_load_dword v5, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -4130,6 +4258,7 @@ define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-LABEL: shuffle_v4bf16_6701:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4
; GFX11-NEXT: global_load_b32 v1, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -4157,6 +4286,7 @@ define <4 x bfloat> @shuffle_v4bf16_6723(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_6723:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -4168,6 +4298,7 @@ define <4 x bfloat> @shuffle_v4bf16_6723(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-LABEL: shuffle_v4bf16_6723:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4
; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -4275,6 +4406,7 @@ define <4 x bfloat> @shuffle_v4bf16_2356(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_2356:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -4286,6 +4418,7 @@ define <4 x bfloat> @shuffle_v4bf16_2356(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_2356:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -4297,6 +4430,7 @@ define <4 x bfloat> @shuffle_v4bf16_2356(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-FAKE16-LABEL: shuffle_v4bf16_2356:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -4335,6 +4469,7 @@ define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_5623:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -4346,6 +4481,7 @@ define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_5623:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -4357,6 +4493,7 @@ define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-FAKE16-LABEL: shuffle_v4bf16_5623:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -4384,6 +4521,7 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_3456:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -4395,6 +4533,7 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_3456:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -4408,6 +4547,7 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-FAKE16-LABEL: shuffle_v4bf16_3456:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -4436,6 +4576,7 @@ define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_5634:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -4447,6 +4588,7 @@ define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_5634:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -4460,6 +4602,7 @@ define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-FAKE16-LABEL: shuffle_v4bf16_5634:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -4501,6 +4644,7 @@ define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_5734:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -4512,6 +4656,7 @@ define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_5734:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -4524,6 +4669,7 @@ define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-FAKE16-LABEL: shuffle_v4bf16_5734:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -4718,6 +4864,7 @@ define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_6161:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -4728,6 +4875,7 @@ define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_6161:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off offset:4
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -4741,6 +4889,7 @@ define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-FAKE16-LABEL: shuffle_v4bf16_6161:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -4926,6 +5075,7 @@ define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v8bf16_4589:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:8
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -4937,6 +5087,7 @@ define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-LABEL: shuffle_v8bf16_4589:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:8
; GFX11-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -4962,6 +5113,7 @@ define <4 x bfloat> @shuffle_v8bf16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrsp
; GFX10-LABEL: shuffle_v8bf16_10_11_2_3:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -4973,6 +5125,7 @@ define <4 x bfloat> @shuffle_v8bf16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrsp
; GFX11-LABEL: shuffle_v8bf16_10_11_2_3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4
; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -5011,6 +5164,7 @@ define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrsp
; GFX10-LABEL: shuffle_v8bf16_13_14_2_3:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -5022,6 +5176,7 @@ define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrsp
; GFX11-TRUE16-LABEL: shuffle_v8bf16_13_14_2_3:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -5033,6 +5188,7 @@ define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrsp
; GFX11-FAKE16-LABEL: shuffle_v8bf16_13_14_2_3:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
@@ -5170,6 +5326,7 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off
; GFX10-NEXT: global_load_dword v7, v[3:4], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -5182,6 +5339,7 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX11-NEXT: global_load_b32 v3, v[3:4], off
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -5696,6 +5854,7 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_0456:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1
@@ -5708,6 +5867,7 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_0456:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -5719,6 +5879,7 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-FAKE16-LABEL: shuffle_v4bf16_0456:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b64 v[1:2], v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -5755,6 +5916,7 @@ define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX10-LABEL: low16bits:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -5764,6 +5926,7 @@ define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX11-TRUE16-LABEL: low16bits:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -5773,6 +5936,7 @@ define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX11-FAKE16-LABEL: low16bits:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -5810,6 +5974,7 @@ define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
; GFX10-LABEL: hi16bits_v2bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -5819,6 +5984,7 @@ define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
; GFX11-TRUE16-LABEL: hi16bits_v2bf16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -5831,6 +5997,7 @@ define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
; GFX11-FAKE16-LABEL: hi16bits_v2bf16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -5868,6 +6035,7 @@ define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1)
; GFX10-LABEL: low16hi16bits_v2bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v5, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -5877,6 +6045,7 @@ define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1)
; GFX11-TRUE16-LABEL: low16hi16bits_v2bf16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -5886,6 +6055,7 @@ define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1)
; GFX11-FAKE16-LABEL: low16hi16bits_v2bf16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v2, v[2:3], off
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -5912,6 +6082,7 @@ define <2 x bfloat> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX10-LABEL: hi16low16bits:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -5921,6 +6092,7 @@ define <2 x bfloat> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX11-TRUE16-LABEL: hi16low16bits:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -5932,6 +6104,7 @@ define <2 x bfloat> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX11-FAKE16-LABEL: hi16low16bits:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -5991,6 +6164,7 @@ define void @shuffle_v8bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
; GFX10-LABEL: shuffle_v8bf16_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -6000,6 +6174,7 @@ define void @shuffle_v8bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
; GFX11-LABEL: shuffle_v8bf16_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -6028,6 +6203,7 @@ define void @shuffle_v16bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %ar
; GFX10-LABEL: shuffle_v16bf16_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -6039,6 +6215,7 @@ define void @shuffle_v16bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %ar
; GFX11-LABEL: shuffle_v16bf16_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -6075,10 +6252,9 @@ define void @shuffle_v32bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %ar
; GFX10-LABEL: shuffle_v32bf16_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
-; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
; GFX10-NEXT: s_waitcnt vmcnt(3)
@@ -6094,10 +6270,9 @@ define void @shuffle_v32bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %ar
; GFX11-LABEL: shuffle_v32bf16_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
; GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16
-; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(3)
diff --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll
index 4ce71e1de039b..78490ed6610a2 100644
--- a/llvm/test/CodeGen/AMDGPU/vselect.ll
+++ b/llvm/test/CodeGen/AMDGPU/vselect.ll
@@ -94,25 +94,23 @@ define amdgpu_kernel void @test_select_v2f32(ptr addrspace(1) %out, ptr addrspac
;
; VI-LABEL: test_select_v2f32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_cmp_neq_f32_e32 vcc, s3, v1
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_cmp_neq_f32_e32 vcc, s7, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: test_select_v2f32:
@@ -266,33 +264,31 @@ define amdgpu_kernel void @test_select_v4f32(ptr addrspace(1) %out, ptr addrspac
;
; VI-LABEL: test_select_v4f32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, s7
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_cmp_neq_f32_e32 vcc, s7, v3
; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: v_cmp_neq_f32_e32 vcc, s3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v2
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_cmp_neq_f32_e32 vcc, s1, v1
+; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: v_cmp_neq_f32_e32 vcc, s5, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cmp_neq_f32_e32 vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: test_select_v4f32:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
index d10dfcaeba7cc..cf6614f1f3d5e 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
@@ -13,10 +13,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
-; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
; GFX12-NEXT: s_endpgm
@@ -43,10 +42,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
-; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
; GFX12-NEXT: s_endpgm
@@ -71,6 +69,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
; GFX12-NEXT: s_endpgm
@@ -95,6 +94,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
; GFX12-NEXT: s_endpgm
@@ -121,10 +121,9 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
-; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_endpgm
@@ -151,10 +150,9 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32>
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
-; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16
; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_endpgm
@@ -181,10 +179,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
-; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_endpgm
@@ -211,10 +208,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
-; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_endpgm
@@ -241,10 +237,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
-; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_endpgm
@@ -271,10 +266,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
-; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
index 311e76b9bb2b0..f126cadb11247 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
@@ -23,6 +23,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
@@ -67,6 +68,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
@@ -105,6 +107,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
@@ -143,6 +146,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
@@ -187,6 +191,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32>
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
@@ -221,6 +226,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: s_endpgm
@@ -247,6 +253,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32>
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off
; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off
; GFX12-NEXT: s_endpgm
@@ -283,6 +290,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
@@ -327,6 +335,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
@@ -371,6 +380,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
@@ -415,6 +425,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
index b7b6028c86dca..fc0e9354021b4 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
@@ -25,10 +25,9 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_f32_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_endpgm
@@ -47,10 +46,9 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B,
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_endpgm
@@ -69,10 +67,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half>
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_endpgm
@@ -89,10 +86,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half>
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_endpgm
@@ -111,10 +107,9 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16>
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_endpgm
@@ -131,10 +126,9 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16>
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_endpgm
@@ -153,10 +147,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15]
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_endpgm
@@ -173,10 +166,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[0,1,0]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_endpgm
@@ -193,10 +185,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,0,0]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_endpgm
@@ -213,10 +204,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,1,0]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_endpgm
@@ -233,10 +223,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] clamp
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_endpgm
@@ -253,10 +242,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_endpgm
@@ -273,10 +261,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_endpgm
@@ -293,10 +280,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_endpgm
@@ -315,10 +301,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11]
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_endpgm
@@ -335,10 +320,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[0,1,0]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_endpgm
@@ -355,10 +339,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,0,0]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_endpgm
@@ -375,10 +358,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,1,0]
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_endpgm
@@ -396,10 +378,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] clamp
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_endpgm
@@ -416,10 +397,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_endpgm
@@ -436,10 +416,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_endpgm
@@ -456,10 +435,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
-; W32-NEXT: s_clause 0x1
+; W32-NEXT: s_clause 0x3
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
-; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
index 524a25cbc1e6d..08660578d2f51 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
@@ -25,6 +25,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f32_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
@@ -43,6 +44,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
@@ -61,6 +63,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
@@ -77,6 +80,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
@@ -95,6 +99,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
@@ -111,6 +116,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
@@ -129,6 +135,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11]
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
@@ -146,6 +153,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
@@ -162,6 +170,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
@@ -178,6 +187,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
@@ -194,6 +204,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] clamp
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] clamp
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
@@ -210,6 +221,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
@@ -226,6 +238,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
@@ -242,6 +255,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
@@ -260,6 +274,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7]
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_endpgm
@@ -276,6 +291,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_endpgm
@@ -292,6 +308,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_endpgm
@@ -308,6 +325,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0]
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_endpgm
@@ -324,6 +342,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] clamp
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] clamp
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_endpgm
@@ -340,6 +359,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_endpgm
@@ -356,6 +376,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_endpgm
@@ -372,6 +393,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
+; W64-NEXT: s_clause 0x1
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 1ca2a8ada68ea..8bdce9dc49060 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -265,11 +265,10 @@ define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_nop 0
-; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-W64-NEXT: v_add_f32_e32 v0, v2, v3
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
@@ -303,11 +302,10 @@ define amdgpu_ps float @test5_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_nop 0
-; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-W64-NEXT: v_add_f32_e32 v0, v2, v3
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
@@ -342,11 +340,10 @@ define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_nop 0
-; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-W64-NEXT: v_add_f32_e32 v0, v2, v3
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
@@ -382,11 +379,10 @@ define amdgpu_ps float @test6_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_nop 0
-; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-W64-NEXT: v_add_f32_e32 v0, v2, v3
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
@@ -424,11 +420,10 @@ define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_nop 0
-; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v4, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX9-W64-NEXT: v_add_f32_e32 v1, v3, v4
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
@@ -461,11 +456,10 @@ define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_nop 0
-; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v4, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-W64-NEXT: v_add_u32_e32 v1, v3, v4
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
@@ -875,11 +869,10 @@ define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_nop 0
-; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v4, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX9-W64-NEXT: v_add_f32_e32 v1, v3, v4
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
@@ -914,11 +907,10 @@ define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_nop 0
-; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v4, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-W64-NEXT: v_add_u32_e32 v1, v3, v4
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
@@ -1307,15 +1299,14 @@ define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-W64-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-W64-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_nop 0
-; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
-; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
+; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
+; GFX9-W64-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-W64-NEXT: v_add_u32_e32 v1, v3, v2
; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_endpgm
;
@@ -2325,11 +2316,10 @@ define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_nop 0
-; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v4, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX9-W64-NEXT: v_add_f32_e32 v1, v3, v4
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
@@ -2362,11 +2352,10 @@ define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_nop 0
-; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen
+; GFX9-W64-NEXT: buffer_load_dword v4, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-W64-NEXT: v_add_u32_e32 v1, v3, v4
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
@@ -3270,16 +3259,16 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[16:19], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
+; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
-; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
-; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
-; GFX9-W64-NEXT: v_mov_b32_e32 v3, v2
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
+; GFX9-W64-NEXT: v_mov_b32_e32 v3, v2
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v3
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[20:21]
; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
@@ -3315,6 +3304,9 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v0, v3, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
+; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
+; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
@@ -3322,9 +3314,6 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
-; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v3
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index 00bb7b24786f5..76ef62f2b14cb 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -9,20 +9,20 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_mov_b32 s15, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_xor_b32_e32 v1, v3, v1
-; SI-NEXT: v_xor_b32_e32 v0, v2, v0
+; SI-NEXT: v_xor_b32_e32 v1, v1, v3
+; SI-NEXT: v_xor_b32_e32 v0, v0, v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -58,22 +58,22 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_mov_b32 s15, s7
-; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_xor_b32_e32 v3, v7, v3
-; SI-NEXT: v_xor_b32_e32 v2, v6, v2
-; SI-NEXT: v_xor_b32_e32 v1, v5, v1
-; SI-NEXT: v_xor_b32_e32 v0, v4, v0
+; SI-NEXT: v_xor_b32_e32 v3, v3, v7
+; SI-NEXT: v_xor_b32_e32 v2, v2, v6
+; SI-NEXT: v_xor_b32_e32 v1, v1, v5
+; SI-NEXT: v_xor_b32_e32 v0, v0, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -111,23 +111,23 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0,
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_mov_b32 s15, s7
-; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0
+; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1
-; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 1.0, v1
+; SI-NEXT: s_xor_b64 vcc, vcc, s[0:1]
+; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -364,20 +364,20 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s14, s6
; SI-NEXT: s_mov_b32 s15, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_xor_b32_e32 v0, v2, v0
-; SI-NEXT: v_xor_b32_e32 v1, v3, v1
+; SI-NEXT: v_xor_b32_e32 v0, v0, v2
+; SI-NEXT: v_xor_b32_e32 v1, v1, v3
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
>From b453d6cb41360260a5aeb1c26000345c9b0cf83b Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Tue, 20 May 2025 03:32:10 -0500
Subject: [PATCH 2/6] Fix lit test failures, remove no-cluster test since we
now expect to cluster
---
.../branch-folding-implicit-def-subreg.ll | 50 ++--
llvm/test/CodeGen/AMDGPU/cluster_stores.ll | 51 ----
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll | 279 ------------------
.../AMDGPU/splitkit-getsubrangeformask.ll | 154 +++++-----
4 files changed, 101 insertions(+), 433 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index d51e47bfb8d4f..297ce4e2543ea 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -560,8 +560,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr17, 16, implicit-def $scc
- ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc
- ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_CSELECT_B64 -1, 0, implicit killed $scc
+ ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_XOR_B64 renamable $sgpr62_sgpr63, -1, implicit-def dead $scc
; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec
; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr50_sgpr51, implicit-def dead $scc
@@ -654,7 +654,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.48.bb63:
; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000)
- ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
+ ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0
; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc
@@ -668,7 +668,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.50.bb68:
; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec
; GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec
@@ -696,7 +696,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.52.bb80:
; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc
; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc
@@ -710,7 +710,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1
- ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37
+ ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37
; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF
@@ -724,7 +724,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.54.bb73:
; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1)
; GFX90A-NEXT: renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec
@@ -756,22 +756,22 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.56.bb90:
; GFX90A-NEXT: successors: %bb.60(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec
; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr11 = COPY renamable $sgpr22, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $sgpr21, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr12, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
- ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
- ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
+ ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr11, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
+ ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr62_sgpr63, implicit $exec
; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr46, implicit $exec
; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr47, killed $vgpr10, 1, implicit $exec
; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec
; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec
; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec
; GFX90A-NEXT: S_BRANCH %bb.60
; GFX90A-NEXT: {{ $}}
@@ -815,13 +815,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr1 = COPY renamable $sgpr23, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr4 = COPY renamable $sgpr21, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr5 = COPY killed renamable $sgpr33, implicit $exec
; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
- ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.419, addrspace 3)
- ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
- ; GFX90A-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr33, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3)
+ ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr1, 0, 0, implicit $exec :: (load (s64) from %ir.419, addrspace 3)
+ ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr4, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
+ ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3)
; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec
; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1
@@ -829,14 +829,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.59.bb85:
; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec
; GFX90A-NEXT: renamable $vgpr9 = COPY renamable $vgpr7, implicit $exec
; GFX90A-NEXT: renamable $vgpr10 = FLAT_LOAD_UBYTE renamable $vgpr8_vgpr9, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86)
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1
; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr10, implicit $exec
- ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37
+ ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37
; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
@@ -850,20 +850,20 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.60.Flow31:
; GFX90A-NEXT: successors: %bb.61(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.61.Flow30:
; GFX90A-NEXT: successors: %bb.55(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_XOR_B64 $exec, -1, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr52_sgpr53, implicit-def dead $scc
; GFX90A-NEXT: S_BRANCH %bb.55
; GFX90A-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index 7ac0b81a67d81..eb762c9ad22de 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -331,57 +331,6 @@ entry:
ret void
}
-; Don't cluster loads from different textures
-; DBG-LABEL: no_cluster_image_load:
-; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: LocationSize::precise(16)
-; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: LocationSize::precise(16)
-; DBG-NOT: {{^}}Cluster ld/st
-define amdgpu_ps void @no_cluster_image_load(<8 x i32> inreg %src1, <8 x i32> inreg %src2, <8 x i32> inreg %dst, i32 %x, i32 %y) {
-; GFX9-LABEL: no_cluster_image_load:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: image_load_mip v[3:6], v[0:2], s[0:7] dmask:0xf unorm
-; GFX9-NEXT: image_load_mip v[7:10], v[0:2], s[8:15] dmask:0xf unorm
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v6, v6, v10
-; GFX9-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-NEXT: v_add_f32_e32 v4, v4, v8
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX9-NEXT: image_store v[3:6], v[0:1], s[16:23] dmask:0xf unorm
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: no_cluster_image_load:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: v_mov_b32_e32 v10, 0
-; GFX10-NEXT: image_load_mip v[2:5], [v0, v1, v10], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
-; GFX10-NEXT: image_load_mip v[6:9], [v0, v1, v10], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX10-NEXT: v_add_f32_e32 v4, v4, v8
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-NEXT: image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: no_cluster_image_load:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: v_mov_b32_e32 v6, 0
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: image_load_mip v[2:5], [v0, v1, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT: image_load_mip v[6:9], [v0, v1, v6], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v7 :: v_dual_add_f32 v2, v2, v6
-; GFX11-NEXT: image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT: s_endpgm
-entry:
- %val1 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src1, i32 0, i32 0)
- %val2 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src2, i32 0, i32 0)
- %val = fadd fast <4 x float> %val1, %val2
- call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %val, i32 15, i32 %x, i32 %y, <8 x i32> %dst, i32 0, i32 0)
- ret void
-}
-
; Cluster loads from the same texture and sampler with different coordinates
; DBG-LABEL: cluster_image_sample:
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: LocationSize::precise(16)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
index a43c0ee749847..010e0faf906e7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
@@ -74,9 +74,6 @@ define i32 @dead_i32(i1 %cond, i32 %x, ptr addrspace(1) %ptr1) #0 {
; ASM-GISEL-FAKE16-NEXT: s_wait_alu 0xfffe
; ASM-GISEL-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; ASM-DAG: ; %bb.0: ; %entry
-; ASM-DAG: ; %bb.0: ; %entry
-; ASM-DAG: ; %bb.0: ; %entry
entry:
%dead = call i32 @llvm.amdgcn.dead.i32()
br i1 %cond, label %if.then, label %if.end
@@ -224,53 +221,6 @@ define %trivial_types @dead_struct(i1 %cond, %trivial_types %x, ptr addrspace(1)
; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v12, v13 :: v_dual_mov_b32 v13, v14
; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v14, v15 :: v_dual_mov_b32 v15, v16
; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; ASM-DAG: ; %bb.0: ; %entry
-; ASM-DAG: ; %bb.0: ; %entry
-; ASM-DAG: ; %bb.0: ; %entry
-; ASM-GISEL-LABEL: dead_struct:
-; ASM-GISEL: ; %bb.0: ; %entry
-; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; ASM-GISEL-NEXT: s_wait_expcnt 0x0
-; ASM-GISEL-NEXT: s_wait_samplecnt 0x0
-; ASM-GISEL-NEXT: s_wait_bvhcnt 0x0
-; ASM-GISEL-NEXT: s_wait_kmcnt 0x0
-; ASM-GISEL-NEXT: v_mov_b32_e32 v20, v0
-; ASM-GISEL-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v2
-; ASM-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; ASM-GISEL-NEXT: v_and_b32_e32 v2, 1, v20
-; ASM-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v2
-; ASM-GISEL-NEXT: s_cbranch_execz .LBB1_2
-; ASM-GISEL-NEXT: ; %bb.1: ; %if.then
-; ASM-GISEL-NEXT: s_mov_b32 s4, 0
-; ASM-GISEL-NEXT: s_mov_b32 s1, 0x3fc00000
-; ASM-GISEL-NEXT: s_wait_alu 0xfffe
-; ASM-GISEL-NEXT: s_mov_b32 s7, s4
-; ASM-GISEL-NEXT: s_mov_b32 s5, s4
-; ASM-GISEL-NEXT: s_mov_b32 s6, s4
-; ASM-GISEL-NEXT: s_wait_alu 0xfffe
-; ASM-GISEL-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v13, s6
-; ASM-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_add_nc_u32 v0, 15, v19
-; ASM-GISEL-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v11, s4
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr8
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr15
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr16
-; ASM-GISEL-NEXT: global_store_b32 v[17:18], v0, off
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr0
-; ASM-GISEL-NEXT: .LBB1_2: ; %if.end
-; ASM-GISEL-NEXT: s_wait_alu 0xfffe
-; ASM-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; ASM-GISEL-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
-; ASM-GISEL-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
-; ASM-GISEL-NEXT: v_dual_mov_b32 v6, v7 :: v_dual_mov_b32 v7, v8
-; ASM-GISEL-NEXT: v_dual_mov_b32 v8, v9 :: v_dual_mov_b32 v9, v10
-; ASM-GISEL-NEXT: v_dual_mov_b32 v10, v11 :: v_dual_mov_b32 v11, v12
-; ASM-GISEL-NEXT: v_dual_mov_b32 v12, v13 :: v_dual_mov_b32 v13, v14
-; ASM-GISEL-NEXT: v_dual_mov_b32 v14, v15 :: v_dual_mov_b32 v15, v16
-; ASM-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
br i1 %cond, label %if.then, label %if.end
@@ -521,86 +471,6 @@ define [32 x i32] @dead_array(i1 %cond, [32 x i32] %x, ptr addrspace(1) %ptr1, i
; ASM-GISEL-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; ASM-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0
; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; ASM-DAG: ; %bb.0: ; %entry
-; ASM-DAG: ; %bb.0: ; %entry
-; ASM-DAG: ; %bb.0: ; %entry
-; ASM-GISEL-LABEL: dead_array:
-; ASM-GISEL: ; %bb.0: ; %entry
-; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; ASM-GISEL-NEXT: s_wait_expcnt 0x0
-; ASM-GISEL-NEXT: s_wait_samplecnt 0x0
-; ASM-GISEL-NEXT: s_wait_bvhcnt 0x0
-; ASM-GISEL-NEXT: s_wait_kmcnt 0x0
-; ASM-GISEL-NEXT: v_mov_b32_e32 v32, v0
-; ASM-GISEL-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v2
-; ASM-GISEL-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
-; ASM-GISEL-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
-; ASM-GISEL-NEXT: v_dual_mov_b32 v6, v7 :: v_dual_mov_b32 v7, v8
-; ASM-GISEL-NEXT: v_dual_mov_b32 v8, v9 :: v_dual_mov_b32 v9, v10
-; ASM-GISEL-NEXT: v_dual_mov_b32 v10, v11 :: v_dual_mov_b32 v11, v12
-; ASM-GISEL-NEXT: v_dual_mov_b32 v12, v13 :: v_dual_mov_b32 v13, v14
-; ASM-GISEL-NEXT: v_dual_mov_b32 v14, v15 :: v_dual_mov_b32 v15, v16
-; ASM-GISEL-NEXT: v_dual_mov_b32 v16, v17 :: v_dual_mov_b32 v17, v18
-; ASM-GISEL-NEXT: v_dual_mov_b32 v18, v19 :: v_dual_mov_b32 v19, v20
-; ASM-GISEL-NEXT: v_dual_mov_b32 v20, v21 :: v_dual_mov_b32 v21, v22
-; ASM-GISEL-NEXT: v_dual_mov_b32 v22, v23 :: v_dual_mov_b32 v23, v24
-; ASM-GISEL-NEXT: v_dual_mov_b32 v24, v25 :: v_dual_mov_b32 v25, v26
-; ASM-GISEL-NEXT: v_dual_mov_b32 v26, v27 :: v_dual_mov_b32 v27, v28
-; ASM-GISEL-NEXT: v_dual_mov_b32 v28, v29 :: v_dual_mov_b32 v29, v30
-; ASM-GISEL-NEXT: s_clause 0x4
-; ASM-GISEL-NEXT: scratch_load_b32 v30, off, s32
-; ASM-GISEL-NEXT: scratch_load_b32 v31, off, s32 offset:4
-; ASM-GISEL-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; ASM-GISEL-NEXT: scratch_load_b32 v34, off, s32 offset:12
-; ASM-GISEL-NEXT: scratch_load_b32 v35, off, s32 offset:16
-; ASM-GISEL-NEXT: v_and_b32_e32 v32, 1, v32
-; ASM-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; ASM-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v32
-; ASM-GISEL-NEXT: s_cbranch_execz .LBB2_2
-; ASM-GISEL-NEXT: ; %bb.1: ; %if.then
-; ASM-GISEL-NEXT: s_mov_b32 s1, 15
-; ASM-GISEL-NEXT: s_mov_b32 s2, 13
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x0
-; ASM-GISEL-NEXT: s_wait_alu 0xfffe
-; ASM-GISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_add_nc_u32 v0, 15, v35
-; ASM-GISEL-NEXT: v_mov_b32_e32 v6, s2
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr1
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr2
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr3
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr4
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr5
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr8
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr9
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr10
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr11
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr12
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr13
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr14
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr15
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr16
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr17
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr18
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr19
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr20
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr21
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr22
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr23
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr24
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr25
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr26
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr27
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr28
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr29
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr30
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr31
-; ASM-GISEL-NEXT: global_store_b32 v[33:34], v0, off
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr0
-; ASM-GISEL-NEXT: .LBB2_2: ; %if.end
-; ASM-GISEL-NEXT: s_wait_alu 0xfffe
-; ASM-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x0
-; ASM-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
br i1 %cond, label %if.then, label %if.end
@@ -921,155 +791,6 @@ define %non_trivial_types @dead_non_trivial(i1 %cond, %non_trivial_types %x, ptr
; ASM-GISEL-FAKE16-NEXT: scratch_store_b32 v0, v67, off offset:204
; ASM-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0
; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; ASM-GISEL-LABEL: dead_non_trivial:
-; ASM-GISEL: ; %bb.0: ; %entry
-; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; ASM-GISEL-NEXT: s_wait_expcnt 0x0
-; ASM-GISEL-NEXT: s_wait_samplecnt 0x0
-; ASM-GISEL-NEXT: s_wait_bvhcnt 0x0
-; ASM-GISEL-NEXT: s_wait_kmcnt 0x0
-; ASM-GISEL-NEXT: s_clause 0x15
-; ASM-GISEL-NEXT: scratch_load_b32 v33, off, s32
-; ASM-GISEL-NEXT: scratch_load_b32 v34, off, s32 offset:4
-; ASM-GISEL-NEXT: scratch_load_b32 v35, off, s32 offset:8
-; ASM-GISEL-NEXT: scratch_load_b32 v36, off, s32 offset:12
-; ASM-GISEL-NEXT: scratch_load_b32 v37, off, s32 offset:16
-; ASM-GISEL-NEXT: scratch_load_b32 v38, off, s32 offset:20
-; ASM-GISEL-NEXT: scratch_load_b32 v39, off, s32 offset:24
-; ASM-GISEL-NEXT: scratch_load_b32 v48, off, s32 offset:28
-; ASM-GISEL-NEXT: scratch_load_b32 v49, off, s32 offset:32
-; ASM-GISEL-NEXT: scratch_load_b32 v50, off, s32 offset:36
-; ASM-GISEL-NEXT: scratch_load_b32 v51, off, s32 offset:40
-; ASM-GISEL-NEXT: scratch_load_b32 v52, off, s32 offset:44
-; ASM-GISEL-NEXT: scratch_load_b32 v53, off, s32 offset:48
-; ASM-GISEL-NEXT: scratch_load_b32 v54, off, s32 offset:52
-; ASM-GISEL-NEXT: scratch_load_b32 v55, off, s32 offset:56
-; ASM-GISEL-NEXT: scratch_load_b32 v64, off, s32 offset:60
-; ASM-GISEL-NEXT: scratch_load_b32 v65, off, s32 offset:64
-; ASM-GISEL-NEXT: scratch_load_b32 v66, off, s32 offset:68
-; ASM-GISEL-NEXT: scratch_load_b32 v67, off, s32 offset:72
-; ASM-GISEL-NEXT: scratch_load_b32 v31, off, s32 offset:76
-; ASM-GISEL-NEXT: scratch_load_b32 v32, off, s32 offset:80
-; ASM-GISEL-NEXT: scratch_load_b32 v68, off, s32 offset:84
-; ASM-GISEL-NEXT: v_and_b32_e32 v1, 1, v1
-; ASM-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; ASM-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v1
-; ASM-GISEL-NEXT: s_cbranch_execz .LBB3_2
-; ASM-GISEL-NEXT: ; %bb.1: ; %if.then
-; ASM-GISEL-NEXT: s_mov_b32 s1, 0
-; ASM-GISEL-NEXT: s_movk_i32 s2, 0x3e00
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x0
-; ASM-GISEL-NEXT: s_wait_alu 0xfffe
-; ASM-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_add_nc_u32 v1, 15, v68
-; ASM-GISEL-NEXT: v_mov_b32_e32 v8, s1
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr2
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr3
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr5
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr6
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr7
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12_vgpr13
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr18
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr19
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr20
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr21
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr22
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr23
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr24
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr25
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr26
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr27
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr28
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr29
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr30
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr33
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr34
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr35
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr36
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr37
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr38
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr39
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr48
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr49
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr50
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr51
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr52
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr53
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr54
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr55
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr64
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr65
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr66
-; ASM-GISEL-NEXT: ; implicit-def: $vgpr67
-; ASM-GISEL-NEXT: global_store_b32 v[31:32], v1, off
-; ASM-GISEL-NEXT: .LBB3_2: ; %if.end
-; ASM-GISEL-NEXT: s_wait_alu 0xfffe
-; ASM-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; ASM-GISEL-NEXT: s_clause 0x16
-; ASM-GISEL-NEXT: scratch_store_b8 v0, v2, off
-; ASM-GISEL-NEXT: scratch_store_b16 v0, v3, off offset:2
-; ASM-GISEL-NEXT: scratch_store_b16 v0, v4, off offset:4
-; ASM-GISEL-NEXT: scratch_store_b16 v0, v5, off offset:6
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v6, off offset:8
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v7, off offset:12
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v8, off offset:16
-; ASM-GISEL-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v13, off offset:48
-; ASM-GISEL-NEXT: scratch_store_b128 v0, v[14:17], off offset:64
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v18, off offset:80
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v19, off offset:84
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v20, off offset:88
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v21, off offset:92
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v22, off offset:96
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v23, off offset:100
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v24, off offset:104
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v25, off offset:108
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v26, off offset:112
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v27, off offset:116
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v28, off offset:120
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v29, off offset:124
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v30, off offset:128
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x15
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v33, off offset:132
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x14
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v34, off offset:136
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x13
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v35, off offset:140
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x12
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v36, off offset:144
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x11
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v37, off offset:148
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x10
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v38, off offset:152
-; ASM-GISEL-NEXT: s_wait_loadcnt 0xf
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v39, off offset:156
-; ASM-GISEL-NEXT: s_wait_loadcnt 0xe
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v48, off offset:160
-; ASM-GISEL-NEXT: s_wait_loadcnt 0xd
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v49, off offset:164
-; ASM-GISEL-NEXT: s_wait_loadcnt 0xc
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v50, off offset:168
-; ASM-GISEL-NEXT: s_wait_loadcnt 0xb
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v51, off offset:172
-; ASM-GISEL-NEXT: s_wait_loadcnt 0xa
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v52, off offset:176
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x9
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v53, off offset:180
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x8
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v54, off offset:184
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x7
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v55, off offset:188
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x6
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v64, off offset:192
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x5
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v65, off offset:196
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x4
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v66, off offset:200
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x3
-; ASM-GISEL-NEXT: scratch_store_b32 v0, v67, off offset:204
-; ASM-GISEL-NEXT: s_wait_loadcnt 0x0
-; ASM-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
br i1 %cond, label %if.then, label %if.end
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index b7e6ebaa655b9..43322e6f33685 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -21,8 +21,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr5
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr18
- ; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr19
+ ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr19
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr18
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr20
; CHECK-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr21
; CHECK-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr22
@@ -36,47 +36,46 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc
; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc
; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY6]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4)
; CHECK-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_]], 31, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ASHR_I32_1:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_1]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub1:sgpr_128 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 65535, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_1:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_1]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 29, implicit-def dead $scc
; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4)
; CHECK-NEXT: KILL undef %74:sreg_64
; CHECK-NEXT: KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM1]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: KILL undef %89:sgpr_128
- ; CHECK-NEXT: KILL undef %118:sgpr_128
; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_1]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.87, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.87, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4)
; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1
; CHECK-NEXT: KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1
; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]], undef %169:sreg_32, implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]].sub0, [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]].sub0, [[S_LSHL_B32_1]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %169:sreg_32, implicit-def $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]].sub0, undef %169:sreg_32, implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]].sub0, [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
@@ -90,21 +89,21 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %302:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4)
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4)
; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc
@@ -114,58 +113,59 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %384:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.134, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.162, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.134, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.162, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -217, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -233, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4)
- ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -313, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -329, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -345, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4)
; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1
; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4)
; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
; CHECK-NEXT: [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]]
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
@@ -198,9 +198,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1
- ; CHECK-NEXT: KILL undef %470:sreg_64
; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3
; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4)
@@ -219,10 +217,10 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[COPY18:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]]
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY18]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -475, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -491, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -507, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -539, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
@@ -240,13 +238,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]]
; CHECK-NEXT: KILL [[V_MOV_B32_e32_]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]]
- ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_U32_e64_]], [[V_ADD_U32_e64_1]], implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -4, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_U32_e64_1]], [[V_ADD_U32_e64_]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -4, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_]], [[V_ADD_U32_e64_2]], implicit $exec
- ; CHECK-NEXT: [[V_SUBREV_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 27, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_SUBREV_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 27, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_1]], [[V_ADD_U32_e64_3]], implicit $exec
; CHECK-NEXT: [[V_SUBREV_U32_e64_1:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 28, [[BUFFER_LOAD_DWORD_OFFSET]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_2]], [[V_SUBREV_U32_e64_]], implicit $exec
@@ -265,7 +263,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_OR_B32_e64_11:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_10]], [[V_SUBREV_U32_e64_5]], implicit $exec
; CHECK-NEXT: [[V_SUBREV_U32_e64_7:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 38, [[BUFFER_LOAD_FORMAT_X_IDXEN7]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_12:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_11]], [[V_SUBREV_U32_e64_6]], implicit $exec
- ; CHECK-NEXT: [[V_SUBREV_U32_e64_8:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 39, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_SUBREV_U32_e64_8:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 39, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_13:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_12]], [[V_SUBREV_U32_e64_7]], implicit $exec
; CHECK-NEXT: [[V_SUBREV_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 50, [[BUFFER_LOAD_FORMAT_X_IDXEN8]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_14:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_13]], [[V_SUBREV_U32_e64_8]], implicit $exec
@@ -279,13 +277,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_OR_B32_e64_18:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_17]], [[V_SUBREV_U32_e64_12]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -73, [[BUFFER_LOAD_FORMAT_X_IDXEN13]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_19:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_18]], [[V_ADD_U32_e64_4]], implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -74, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -74, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_20:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_19]], [[V_ADD_U32_e64_5]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -75, [[BUFFER_LOAD_FORMAT_X_IDXEN14]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_21:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_20]], [[V_ADD_U32_e64_6]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -77, [[BUFFER_LOAD_FORMAT_X_IDXEN15]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_22:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_21]], [[V_ADD_U32_e64_7]], implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -93, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -93, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_23:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_22]], [[V_ADD_U32_e64_8]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -94, [[BUFFER_LOAD_FORMAT_X_IDXEN16]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_24:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_23]], [[V_ADD_U32_e64_9]], implicit $exec
@@ -307,7 +305,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_OR_B32_e64_34:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_33]], [[V_ADD_U32_e64_15]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -197, [[BUFFER_LOAD_FORMAT_X_IDXEN20]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_35:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_34]], [[V_ADD_U32_e64_16]], implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -216, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -216, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_36:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_35]], [[V_ADD_U32_e64_17]], implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_37:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_36]], [[V_ADD_U32_e64_18]], implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_7]], [[V_OR_B32_e64_37]], implicit $exec
@@ -317,9 +315,9 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_OR_B32_e64_42:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_11]], [[V_OR_B32_e64_41]], implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_43:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_12]], [[V_OR_B32_e64_42]], implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_44:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_13]], [[V_OR_B32_e64_43]], implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -457, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -457, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_45:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_14]], [[V_OR_B32_e64_44]], implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -458, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -458, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_46:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_45]], [[V_ADD_U32_e64_19]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -459, [[BUFFER_LOAD_FORMAT_X_IDXEN21]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_47:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_46]], [[V_ADD_U32_e64_20]], implicit $exec
@@ -340,16 +338,16 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_OR_B32_e64_59:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_58]], [[V_ADD_U32_e64_23]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -557, [[BUFFER_LOAD_FORMAT_X_IDXEN25]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_60:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_59]], [[V_ADD_U32_e64_24]], implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -574, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -574, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_61:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_60]], [[V_ADD_U32_e64_25]], implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -575, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -575, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_62:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_61]], [[V_ADD_U32_e64_26]], implicit $exec
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX2_IMM]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -576, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -576, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_63:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_62]], [[V_ADD_U32_e64_27]], implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -577, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -577, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec
; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %543:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4)
; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec
>From 608b0025b1ec99f9d612201864e499260e93ec00 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Tue, 20 May 2025 03:36:10 -0500
Subject: [PATCH 3/6] NFC formatting
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 8b19ab35bc822..43731c4b6f1db 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -48,10 +48,9 @@ namespace llvm::AMDGPU {
} // namespace llvm::AMDGPU
static cl::opt<bool> DisableDiffBasePtrMemClustering(
- "amdgpu-disable-diff-baseptr-mem-clustering",
- cl::desc("Disable clustering memory ops with different base pointers"),
- cl::init(false),
- cl::Hidden);
+ "amdgpu-disable-diff-baseptr-mem-clustering",
+ cl::desc("Disable clustering memory ops with different base pointers"),
+ cl::init(false), cl::Hidden);
// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
@@ -529,9 +528,9 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
}
static bool memOpsHaveSameAddrspace(const MachineInstr &MI1,
- ArrayRef<const MachineOperand *> BaseOps1,
- const MachineInstr &MI2,
- ArrayRef<const MachineOperand *> BaseOps2) {
+ ArrayRef<const MachineOperand *> BaseOps1,
+ const MachineInstr &MI2,
+ ArrayRef<const MachineOperand *> BaseOps2) {
// If base is identical, assume identical addrspace
if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
return true;
@@ -585,14 +584,14 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
if (!BaseOps1.empty() && !BaseOps2.empty()) {
const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
-
+
if (!DisableDiffBasePtrMemClustering) {
// Only consider memory ops from same addrspace for clustering
if (!memOpsHaveSameAddrspace(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
return false;
} else {
- // If the mem ops (to be clustered) do not have the same base ptr, then they
- // should not be clustered
+ // If the mem ops (to be clustered) do not have the same base ptr, then
+ // they should not be clustered
if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
return false;
}
>From 8dba56cdfd2027b01635cb944addfb54c012df2a Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 21 May 2025 17:52:47 -0500
Subject: [PATCH 4/6] Addressed feedback: Change to Enable, Reject scalar and
vector clustering
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 24 +-
...licit-kernarg-backend-usage-global-isel.ll | 8 +-
.../AMDGPU/agpr-copy-no-free-registers.ll | 50 +--
llvm/test/CodeGen/AMDGPU/idot2.ll | 374 +++++++++---------
llvm/test/CodeGen/AMDGPU/idot4s.ll | 138 +++----
llvm/test/CodeGen/AMDGPU/idot4u.ll | 226 +++++------
llvm/test/CodeGen/AMDGPU/idot8s.ll | 54 +--
llvm/test/CodeGen/AMDGPU/idot8u.ll | 60 +--
.../AMDGPU/implicit-kernarg-backend-usage.ll | 8 +-
llvm/test/CodeGen/AMDGPU/kernel-args.ll | 6 +-
10 files changed, 483 insertions(+), 465 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 43731c4b6f1db..d6e646c8ffc15 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -47,10 +47,10 @@ namespace llvm::AMDGPU {
#include "AMDGPUGenSearchableTables.inc"
} // namespace llvm::AMDGPU
-static cl::opt<bool> DisableDiffBasePtrMemClustering(
- "amdgpu-disable-diff-baseptr-mem-clustering",
- cl::desc("Disable clustering memory ops with different base pointers"),
- cl::init(false), cl::Hidden);
+static cl::opt<bool> EnableDiffBasePtrMemClustering(
+ "amdgpu-enable-diff-baseptr-mem-clustering",
+ cl::desc("Enable clustering memory ops with different base pointers"),
+ cl::init(true), cl::Hidden);
// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
@@ -585,10 +585,24 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
- if (!DisableDiffBasePtrMemClustering) {
+ if (EnableDiffBasePtrMemClustering) {
// Only consider memory ops from same addrspace for clustering
if (!memOpsHaveSameAddrspace(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
return false;
+
+ // Don't cluster scalar and vector memory ops
+ const MachineFunction &MF = *FirstLdSt.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (FirstLdSt.getOperand(0).isReg() &&
+ SecondLdSt.getOperand(0).isReg()) {
+ bool isFirstVecReg = RI.isVectorRegister(MRI,
+ FirstLdSt.getOperand(0).getReg());
+ bool isSecondVecReg = RI.isVectorRegister(MRI,
+ SecondLdSt.getOperand(0).getReg());
+ if ((isFirstVecReg && !isSecondVecReg) ||
+ (!isFirstVecReg && isSecondVecReg))
+ return false;
+ }
} else {
// If the mem ops (to be clustered) do not have the same base ptr, then
// they should not be clustered
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 89f896a2b1656..86766e2904619 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -288,16 +288,16 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
-; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s4
; GFX8V4-NEXT: v_mov_b32_e32 v1, s5
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s10
+; GFX8V4-NEXT: v_mov_b32_e32 v1, s11
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v3, s1
-; GFX8V4-NEXT: v_mov_b32_e32 v1, s11
; GFX8V4-NEXT: v_mov_b32_e32 v2, s0
; GFX8V4-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
@@ -314,16 +314,16 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
-; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s10
+; GFX8V5-NEXT: v_mov_b32_e32 v1, s11
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v3, s1
-; GFX8V5-NEXT: v_mov_b32_e32 v1, s11
; GFX8V5-NEXT: v_mov_b32_e32 v2, s0
; GFX8V5-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 5fe362616e67d..3160e38df5e3f 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -513,16 +513,16 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-LABEL: introduced_copy_to_sgpr:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc
-; GFX908-NEXT: s_load_dword s0, s[8:9], 0x18
; GFX908-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
; GFX908-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10
-; GFX908-NEXT: s_mov_b32 s8, 0
-; GFX908-NEXT: s_mov_b32 s13, s8
-; GFX908-NEXT: v_mov_b32_e32 v19, 0
+; GFX908-NEXT: s_load_dword s0, s[8:9], 0x18
+; GFX908-NEXT: s_mov_b32 s12, 0
+; GFX908-NEXT: s_mov_b32 s9, s12
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX908-NEXT: s_sub_i32 s1, 0, s7
; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s0
+; GFX908-NEXT: v_mov_b32_e32 v19, 0
; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, 0
; GFX908-NEXT: v_mov_b32_e32 v1, 0
@@ -542,14 +542,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_cselect_b32 s2, s6, s2
; GFX908-NEXT: s_add_i32 s3, s1, 1
; GFX908-NEXT: s_cmp_ge_u32 s2, s7
-; GFX908-NEXT: s_cselect_b32 s12, s3, s1
+; GFX908-NEXT: s_cselect_b32 s8, s3, s1
; GFX908-NEXT: s_lshr_b32 s2, s0, 16
; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s2
; GFX908-NEXT: s_lshl_b64 s[6:7], s[4:5], 5
; GFX908-NEXT: s_lshl_b64 s[14:15], s[10:11], 5
; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1]
; GFX908-NEXT: s_or_b32 s14, s14, 28
-; GFX908-NEXT: s_lshl_b64 s[16:17], s[12:13], 5
+; GFX908-NEXT: s_lshl_b64 s[16:17], s[8:9], 5
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readfirstlane_b32 s2, v16
; GFX908-NEXT: s_and_b32 s2, 0xffff, s2
@@ -573,15 +573,15 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
-; GFX908-NEXT: s_mov_b32 s9, s8
+; GFX908-NEXT: s_mov_b32 s13, s12
; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
-; GFX908-NEXT: v_mov_b32_e32 v4, s8
+; GFX908-NEXT: v_mov_b32_e32 v4, s12
; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6
-; GFX908-NEXT: v_mov_b32_e32 v6, s8
-; GFX908-NEXT: v_mov_b32_e32 v8, s8
-; GFX908-NEXT: v_mov_b32_e32 v5, s9
-; GFX908-NEXT: v_mov_b32_e32 v7, s9
-; GFX908-NEXT: v_mov_b32_e32 v9, s9
+; GFX908-NEXT: v_mov_b32_e32 v6, s12
+; GFX908-NEXT: v_mov_b32_e32 v8, s12
+; GFX908-NEXT: v_mov_b32_e32 v5, s13
+; GFX908-NEXT: v_mov_b32_e32 v7, s13
+; GFX908-NEXT: v_mov_b32_e32 v9, s13
; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
; GFX908-NEXT: v_mov_b32_e32 v11, v5
; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15]
@@ -667,7 +667,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
; GFX908-NEXT: ; %bb.11: ; %bb12
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT: s_add_u32 s10, s10, s12
+; GFX908-NEXT: s_add_u32 s10, s10, s8
; GFX908-NEXT: s_addc_u32 s11, s11, 0
; GFX908-NEXT: s_add_u32 s14, s14, s16
; GFX908-NEXT: s_addc_u32 s15, s15, s17
@@ -679,15 +679,15 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-LABEL: introduced_copy_to_sgpr:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc
-; GFX90A-NEXT: s_load_dword s0, s[8:9], 0x18
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
; GFX90A-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10
-; GFX90A-NEXT: s_mov_b32 s8, 0
-; GFX90A-NEXT: s_mov_b32 s13, s8
-; GFX90A-NEXT: v_mov_b32_e32 v19, 0
+; GFX90A-NEXT: s_load_dword s0, s[8:9], 0x18
+; GFX90A-NEXT: s_mov_b32 s12, 0
+; GFX90A-NEXT: s_mov_b32 s9, s12
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX90A-NEXT: s_sub_i32 s1, 0, s7
+; GFX90A-NEXT: v_mov_b32_e32 v19, 0
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 0, 0
; GFX90A-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
@@ -707,14 +707,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_cselect_b32 s2, s6, s2
; GFX90A-NEXT: s_add_i32 s3, s1, 1
; GFX90A-NEXT: s_cmp_ge_u32 s2, s7
-; GFX90A-NEXT: s_cselect_b32 s12, s3, s1
+; GFX90A-NEXT: s_cselect_b32 s8, s3, s1
; GFX90A-NEXT: s_lshr_b32 s2, s0, 16
; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s2
; GFX90A-NEXT: s_lshl_b64 s[6:7], s[4:5], 5
; GFX90A-NEXT: s_lshl_b64 s[14:15], s[10:11], 5
; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1]
; GFX90A-NEXT: s_or_b32 s14, s14, 28
-; GFX90A-NEXT: s_lshl_b64 s[16:17], s[12:13], 5
+; GFX90A-NEXT: s_lshl_b64 s[16:17], s[8:9], 5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_readfirstlane_b32 s2, v18
; GFX90A-NEXT: s_and_b32 s2, 0xffff, s2
@@ -738,12 +738,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
-; GFX90A-NEXT: s_mov_b32 s9, s8
+; GFX90A-NEXT: s_mov_b32 s13, s12
; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3]
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1]
; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v8
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15]
; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
@@ -821,7 +821,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
; GFX90A-NEXT: ; %bb.11: ; %bb12
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT: s_add_u32 s10, s10, s12
+; GFX90A-NEXT: s_add_u32 s10, s10, s8
; GFX90A-NEXT: s_addc_u32 s11, s11, 0
; GFX90A-NEXT: s_add_u32 s14, s14, s16
; GFX90A-NEXT: s_addc_u32 s15, s15, s17
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index b064689f25c9d..88276e46f355a 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -22,9 +22,9 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
@@ -73,13 +73,13 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -91,12 +91,12 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -108,13 +108,13 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v1, v2, s0
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -161,9 +161,9 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
@@ -190,14 +190,14 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mul_u32_u24_sdwa v1, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX8-NEXT: v_mul_u32_u24_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
@@ -211,13 +211,13 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, s0
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -229,13 +229,13 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -249,13 +249,13 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, s0
@@ -301,9 +301,9 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
@@ -352,13 +352,13 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -370,12 +370,12 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -387,13 +387,13 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v1, v2, s0
+; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -437,9 +437,9 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
@@ -488,13 +488,13 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -506,13 +506,13 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -526,13 +526,13 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
@@ -579,9 +579,9 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
@@ -652,12 +652,12 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -669,13 +669,13 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v1, v2, s0
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -719,9 +719,9 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
@@ -770,13 +770,13 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -788,13 +788,13 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -808,13 +808,13 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
@@ -861,9 +861,9 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
@@ -1054,13 +1054,13 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -1072,12 +1072,12 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -1089,13 +1089,13 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v1, v2, s0
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -1139,9 +1139,9 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
@@ -1194,13 +1194,13 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3] offset:4
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1] offset:4
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] offset:4
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] offset:4
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -1212,12 +1212,12 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] offset:4
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] offset:4
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] offset:4
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -1229,13 +1229,13 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] offset:4
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] offset:4
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] offset:4
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v1, v2, s0
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -1331,13 +1331,13 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
-; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
+; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0
; GFX9-NODL-NEXT: global_store_dword v4, v0, s[6:7]
@@ -1349,13 +1349,13 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
-; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
+; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0
; GFX9-DL-NEXT: global_store_dword v4, v0, s[6:7]
@@ -1369,13 +1369,13 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
-; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
+; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
@@ -1474,13 +1474,13 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
-; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
+; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0
; GFX9-NODL-NEXT: global_store_dword v4, v0, s[6:7]
@@ -1492,13 +1492,13 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
-; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
+; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0
; GFX9-DL-NEXT: global_store_dword v4, v0, s[6:7]
@@ -1512,13 +1512,13 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
-; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
+; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
@@ -1565,9 +1565,9 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
@@ -1616,13 +1616,13 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -1634,13 +1634,13 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -1654,13 +1654,13 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
@@ -1707,9 +1707,9 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
@@ -1760,16 +1760,16 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v1, v2, s0
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
@@ -1780,16 +1780,16 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
@@ -1860,9 +1860,9 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
@@ -1913,16 +1913,16 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
+; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v1, v2, s0
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
@@ -1933,16 +1933,16 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
+; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
@@ -2013,9 +2013,9 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
@@ -2172,9 +2172,9 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
@@ -2331,9 +2331,9 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
@@ -2384,17 +2384,17 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v1, v2
+; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v1, v2, s0
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
@@ -2405,17 +2405,17 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v1, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
@@ -2488,9 +2488,9 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
@@ -2541,17 +2541,17 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v1, v2
+; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
+; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v1, v2, s0
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
@@ -2562,17 +2562,17 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
+; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v1, v2
+; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
@@ -2785,9 +2785,9 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
@@ -2838,15 +2838,15 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 3491785a9c5dc..9f792c84919b2 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -21,19 +21,19 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
-; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v5, v0, 0, 8
; GFX7-NEXT: v_bfe_i32 v6, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v1, v1, v5, s8
+; GFX7-NEXT: v_mad_i32_i24 v1, v1, v5, s4
+; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8
; GFX7-NEXT: v_bfe_i32 v7, v0, 16, 8
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v6, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2
@@ -84,15 +84,15 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
@@ -105,12 +105,12 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -123,14 +123,14 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v2, v1
+; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
@@ -144,11 +144,11 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s0 neg_lo:[1,1,0]
; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -590,20 +590,20 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
-; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v5, v0, 0, 8
; GFX7-NEXT: v_bfe_i32 v6, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v8, v1, v5, s8
+; GFX7-NEXT: v_mad_i32_i24 v8, v1, v5, s4
; GFX7-NEXT: v_mad_i32_i24 v3, v3, v6, v8
+; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8
; GFX7-NEXT: v_bfe_i32 v7, v0, 16, 8
; GFX7-NEXT: v_mad_i32_i24 v1, v1, v5, v3
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2
@@ -797,9 +797,9 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 24, v2
@@ -812,7 +812,7 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_i32 v7, v0, 8, 8
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s8
+; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s4
; GFX7-NEXT: v_mad_i32_i24 v0, v4, v7, v0
; GFX7-NEXT: v_mad_i32_i24 v0, v3, v6, v0
; GFX7-NEXT: v_mad_i32_i24 v0, v1, v5, v0
@@ -886,12 +886,12 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -904,14 +904,14 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v2, v1
+; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
@@ -925,11 +925,11 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s0 neg_lo:[1,1,0]
; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -1284,9 +1284,9 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
@@ -1335,13 +1335,13 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -1454,19 +1454,19 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
-; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v4, v0, 0, 8
; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v1, v1, v4, s0
+; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
@@ -1641,19 +1641,19 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 24, v2
; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8
-; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_ashrrev_i32_e32 v4, 24, v0
; GFX7-NEXT: v_bfe_i32 v5, v0, 0, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v1, v1, v4, s0
+; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
@@ -1998,21 +1998,21 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3]
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b32 s10, -1
; GFX7-NEXT: s_mov_b32 s8, s6
; GFX7-NEXT: s_mov_b32 s9, s7
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v1, v1, v1, s0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v3, v3, 8, 8
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mad_i32_i24 v1, v1, v1, s0
; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v6, v0, 16, 8
@@ -2207,21 +2207,21 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3]
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b32 s10, -1
; GFX7-NEXT: s_mov_b32 s8, s6
; GFX7-NEXT: s_mov_b32 s9, s7
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v1, v1, v1, s0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v3, v3, 8, 8
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mad_i32_i24 v1, v1, v1, s0
; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8
@@ -2399,7 +2399,6 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1,
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dword s12, s[4:5], 0xf
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
@@ -2408,19 +2407,21 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11
; GFX7-NEXT: s_sext_i32_i16 s1, s12
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8
-; GFX7-NEXT: v_mad_i32_i24 v1, v3, s1, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8
+; GFX7-NEXT: v_mad_i32_i24 v1, v3, s1, v1
; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_i32_i24 v1, v4, v5, v1
@@ -2623,20 +2624,20 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
-; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v4, v0, 0, 8
; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v1, v1, v4, s0
; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8
+; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -2813,11 +2814,11 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3]
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b32 s10, -1
; GFX7-NEXT: s_mov_b32 s8, s6
; GFX7-NEXT: s_mov_b32 s9, s7
@@ -3004,31 +3005,32 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_4src:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b32 s18, 0
+; GFX7-NEXT: s_mov_b32 s19, s3
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[16:17], s[8:9]
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[12:13]
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[14:15]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[16:19], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[16:17], s[10:11]
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[16:19], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[16:17], s[12:13]
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[16:17], s[14:15]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v1, v1, v2, s8
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v5, v3, 0, 8
; GFX7-NEXT: v_bfe_i32 v3, v3, 8, 8
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mad_i32_i24 v1, v1, v2, s4
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v2, v4, 0, 8
; GFX7-NEXT: v_bfe_i32 v4, v4, 8, 8
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 84f9c6c7ea5c7..3ebee71515c68 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -21,19 +21,19 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
-; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, s8
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, s4
+; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
@@ -84,15 +84,15 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
@@ -105,12 +105,12 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -122,13 +122,13 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
@@ -142,11 +142,11 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -1093,20 +1093,20 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
-; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v8, v1, v5, s8
+; GFX7-NEXT: v_mad_u32_u24 v8, v1, v5, s4
; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, v8
+; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
@@ -1300,25 +1300,25 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, s8
+; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, s4
+; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, s8, v3
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, s4, v3
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -1367,17 +1367,17 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT: v_bfe_u32 v5, v1, 8, 8
+; GFX9-NODL-NEXT: v_bfe_u32 v4, v1, 8, 8
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT: v_bfe_u32 v5, v2, 8, 8
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, s0
; GFX9-NODL-NEXT: v_add_u32_e32 v4, s0, v2
@@ -1392,15 +1392,15 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_add_i32 s1, s0, s0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
; GFX9-DL-NEXT: v_add3_u32 v1, s1, v3, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
@@ -1413,14 +1413,14 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
; GFX10-DL-NEXT: s_add_i32 s0, s0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_add3_u32 v0, s0, v0, v1
@@ -2090,9 +2090,9 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2
@@ -2105,7 +2105,7 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s8
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v1, v5, v0
@@ -2153,15 +2153,15 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
@@ -2174,12 +2174,12 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -2191,13 +2191,13 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
@@ -2211,11 +2211,11 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -2833,9 +2833,9 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
@@ -2884,13 +2884,13 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -3001,19 +3001,19 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
-; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, s0
+; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
@@ -3186,19 +3186,19 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
-; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, s0
+; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
@@ -3542,21 +3542,21 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3]
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b32 s10, -1
; GFX7-NEXT: s_mov_b32 s8, s6
; GFX7-NEXT: s_mov_b32 s9, s7
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v1, v1, s0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v1, s0
; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_u32 v6, v0, 16, 8
@@ -3751,21 +3751,21 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3]
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b32 s10, -1
; GFX7-NEXT: s_mov_b32 s8, s6
; GFX7-NEXT: s_mov_b32 s9, s7
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v1, v1, s0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v1, s0
; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
@@ -3944,7 +3944,6 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1,
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dword s12, s[4:5], 0xf
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
@@ -3953,19 +3952,21 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11
; GFX7-NEXT: s_and_b32 s1, s12, 0xffff
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, s1, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT: v_mad_u32_u24 v1, v3, s1, v1
; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v5, v1
@@ -4168,20 +4169,20 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
-; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, s0
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -4357,11 +4358,11 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3]
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX7-NEXT: s_mov_b32 s10, -1
; GFX7-NEXT: s_mov_b32 s8, s6
; GFX7-NEXT: s_mov_b32 s9, s7
@@ -4547,31 +4548,32 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX7-LABEL: udot4_4src:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b32 s18, 0
+; GFX7-NEXT: s_mov_b32 s19, s3
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[16:17], s[8:9]
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[12:13]
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[14:15]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[16:19], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[16:17], s[10:11]
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[16:19], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[16:17], s[12:13]
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[16:17], s[14:15]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v1, v2, s8
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v3
; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v2, s4
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v4
; GFX7-NEXT: v_bfe_u32 v4, v4, 8, 8
@@ -4884,8 +4886,8 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX9-NODL-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
@@ -6163,17 +6165,17 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v1, v1, s0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v1, s0
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -6188,11 +6190,11 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
@@ -6231,17 +6233,17 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0500
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0100
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v1, v1, v2, s1
-; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s2
+; GFX9-DL-NEXT: v_perm_b32 v2, v2, v1, s1
+; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -6253,13 +6255,13 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_perm_b32 v0, v1, v2, 0xc0c0500
-; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0100
+; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0500
+; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
@@ -6276,15 +6278,15 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT: v_perm_b32 v1, v1, v0, 0xc0c0500
-; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0100
+; GFX11-DL-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0500
+; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index 4201f9c238970..e94959b39ad35 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -26,20 +26,20 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4
; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4
-; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4
; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, s8
+; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, s4
+; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1
; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4
@@ -172,8 +172,8 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s14, -1
; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
@@ -181,7 +181,7 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -199,13 +199,13 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
-; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v2, v1, s0
+; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
@@ -223,11 +223,11 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
-; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v0, v1, s0
+; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s0
; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -1364,21 +1364,21 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4
; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4
-; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v16, v1, v9, s8
+; GFX7-NEXT: v_mad_i32_i24 v16, v1, v9, s4
; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4
; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, v16
+; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1
; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4
@@ -1755,9 +1755,9 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
@@ -1779,7 +1779,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_i32 v15, v0, 4, 4
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s8
+; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s4
; GFX7-NEXT: v_mad_i32_i24 v0, v8, v15, v0
; GFX7-NEXT: v_mad_i32_i24 v0, v7, v14, v0
; GFX7-NEXT: v_mad_i32_i24 v0, v6, v13, v0
@@ -1802,11 +1802,11 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s15, 0xe80000
; GFX8-NEXT: s_add_u32 s12, s12, s11
@@ -1901,8 +1901,8 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s14, -1
; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
@@ -1910,7 +1910,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -1928,13 +1928,13 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
-; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v2, v1, s0
+; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
@@ -1952,11 +1952,11 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
-; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v0, v1, s0
+; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s0
; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 00c49cdecefb6..97b5481a50caf 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -24,9 +24,9 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
@@ -48,7 +48,7 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s8
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4
; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
@@ -71,11 +71,11 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s15, 0xe80000
; GFX8-NEXT: s_add_u32 s12, s12, s11
@@ -170,8 +170,8 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s14, -1
; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
@@ -179,7 +179,7 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -197,13 +197,13 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0
+; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -1541,9 +1541,9 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
@@ -1565,7 +1565,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v16, v2, v0, s8
+; GFX7-NEXT: v_mad_u32_u24 v16, v2, v0, s4
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16
; GFX7-NEXT: v_mad_u32_u24 v2, v8, v15, v16
; GFX7-NEXT: v_mad_u32_u24 v2, v7, v14, v2
@@ -1590,11 +1590,11 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s15, 0xe80000
; GFX8-NEXT: s_add_u32 s12, s12, s11
@@ -1879,9 +1879,9 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
@@ -1903,7 +1903,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s8
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4
; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
@@ -1926,11 +1926,11 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s15, 0xe80000
; GFX8-NEXT: s_add_u32 s12, s12, s11
@@ -2025,8 +2025,8 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s14, -1
; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
@@ -2034,7 +2034,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0
+; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -2052,13 +2052,13 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0
+; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -3121,16 +3121,16 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 15, v2
; GFX7-NEXT: v_bfe_u32 v3, v2, 4, 4
-; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v9, 15, v0
+; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 4
; GFX7-NEXT: v_bfe_u32 v5, v2, 12, 4
; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
; GFX7-NEXT: v_bfe_u32 v7, v2, 20, 4
@@ -3144,7 +3144,7 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX7-NEXT: v_bfe_u32 v15, v0, 24, 4
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 28, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v9, v1, s8
+; GFX7-NEXT: v_mad_u32_u24 v1, v9, v1, s4
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v10, v3, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v11, v4, v0
@@ -3252,12 +3252,12 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
+; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -3269,13 +3269,13 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
+; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %v2addr,
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 4a8967db765c0..ec80efc5f0362 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -295,16 +295,16 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
-; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s4
; GFX8V4-NEXT: v_mov_b32_e32 v1, s5
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: v_mov_b32_e32 v2, s10
+; GFX8V4-NEXT: v_mov_b32_e32 v3, s11
; GFX8V4-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
-; GFX8V4-NEXT: v_mov_b32_e32 v3, s11
; GFX8V4-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: s_endpgm
@@ -320,16 +320,16 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
-; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: v_mov_b32_e32 v2, s10
+; GFX8V5-NEXT: v_mov_b32_e32 v3, s11
; GFX8V5-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
-; GFX8V5-NEXT: v_mov_b32_e32 v3, s11
; GFX8V5-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 9df995b5a7066..da474d3889413 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -4706,10 +4706,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
; SI-LABEL: packed_struct_argument_alignment:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dword s2, s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:49
; SI-NEXT: buffer_load_ubyte v5, off, s[4:7], 0 offset:50
; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:51
@@ -4754,11 +4754,11 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v6, s0
+; VI-NEXT: s_add_u32 s0, s4, 53
; VI-NEXT: flat_load_ubyte v8, v[0:1]
; VI-NEXT: flat_load_ubyte v9, v[2:3]
; VI-NEXT: flat_load_ubyte v10, v[4:5]
; VI-NEXT: flat_load_ubyte v6, v[6:7]
-; VI-NEXT: s_add_u32 s0, s4, 53
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
>From 952d5f7015256aae08e49577b6cfbf5c9a06fd32 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 21 May 2025 19:00:04 -0500
Subject: [PATCH 5/6] Add flag test .ll file. Couldn't get a contrived .mir
testcase to not cluster
---
.../test-enable-diffbase-clustering-flag.ll | 117 ++++++++++++++++++
1 file changed, 117 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/test-enable-diffbase-clustering-flag.ll
diff --git a/llvm/test/CodeGen/AMDGPU/test-enable-diffbase-clustering-flag.ll b/llvm/test/CodeGen/AMDGPU/test-enable-diffbase-clustering-flag.ll
new file mode 100644
index 0000000000000..9a82b5727ba37
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/test-enable-diffbase-clustering-flag.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 -amdgpu-enable-diff-baseptr-mem-clustering=false < %s | FileCheck -check-prefixes=GFX10N %s
+define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %i21, ptr addrspace(1) nocapture noundef writeonly align 4 %arg, i32 noundef %arg1) #1 {
+; GFX10-LABEL: compute_mad:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: s_load_dword s6, s[4:5], 0x18
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-NEXT: s_load_dword s2, s[2:3], 0x4
+; GFX10-NEXT: s_add_i32 s6, s6, 1
+; GFX10-NEXT: v_mul_lo_u32 v1, s6, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v2, s6, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; GFX10-NEXT: v_mul_lo_u32 v2, v2, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX10-NEXT: v_mul_lo_u32 v3, v2, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v1, v3, v1
+; GFX10-NEXT: v_mul_lo_u32 v2, v1, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v3
+; GFX10-NEXT: v_mul_lo_u32 v4, v2, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v1
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, s8, s2, v[0:1]
+; GFX10-NEXT: v_mul_lo_u32 v1, v3, v2
+; GFX10-NEXT: v_add_co_u32 v2, s0, s0, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, s1, 0, s0
+; GFX10-NEXT: v_mad_u64_u32 v[4:5], null, v1, v4, v[1:2]
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v4, v1, v[4:5]
+; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, s4, v2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v2, null, s5, v3, vcc_lo
+; GFX10-NEXT: global_store_dword v[1:2], v0, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX10N-LABEL: compute_mad:
+; GFX10N: ; %bb.0: ; %bb
+; GFX10N-NEXT: s_load_dword s0, s[4:5], 0x18
+; GFX10N-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10N-NEXT: s_add_i32 s0, s0, 1
+; GFX10N-NEXT: v_mul_lo_u32 v1, s0, v0
+; GFX10N-NEXT: v_add_nc_u32_e32 v2, s0, v1
+; GFX10N-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10N-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; GFX10N-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GFX10N-NEXT: v_mul_lo_u32 v2, v2, v0
+; GFX10N-NEXT: v_mul_lo_u32 v3, v2, v1
+; GFX10N-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10N-NEXT: s_load_dword s2, s[2:3], 0x4
+; GFX10N-NEXT: v_add_nc_u32_e32 v1, v3, v1
+; GFX10N-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10N-NEXT: v_mul_lo_u32 v2, v1, v2
+; GFX10N-NEXT: v_add_nc_u32_e32 v1, 1, v3
+; GFX10N-NEXT: v_mul_lo_u32 v4, v2, v1
+; GFX10N-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10N-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX10N-NEXT: v_add_nc_u32_e32 v3, v4, v1
+; GFX10N-NEXT: v_mad_u64_u32 v[0:1], null, s8, s2, v[0:1]
+; GFX10N-NEXT: v_mul_lo_u32 v1, v3, v2
+; GFX10N-NEXT: v_add_co_u32 v2, s0, s0, v0
+; GFX10N-NEXT: v_add_co_ci_u32_e64 v3, null, s1, 0, s0
+; GFX10N-NEXT: v_mad_u64_u32 v[4:5], null, v1, v4, v[1:2]
+; GFX10N-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX10N-NEXT: v_mad_u64_u32 v[0:1], null, v4, v1, v[4:5]
+; GFX10N-NEXT: v_add_co_u32 v1, vcc_lo, s4, v2
+; GFX10N-NEXT: v_add_co_ci_u32_e64 v2, null, s5, v3, vcc_lo
+; GFX10N-NEXT: global_store_dword v[1:2], v0, off
+; GFX10N-NEXT: s_endpgm
+bb:
+ %i = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
+ %i2 = add i32 %arg1, 1
+ %i3 = mul i32 %i2, %i
+ %i4 = add i32 %i3, %i2
+ %i5 = mul i32 %i4, %i
+ %i6 = add i32 %i3, 1
+ %i7 = mul i32 %i5, %i6
+ %i8 = add i32 %i7, %i6
+ %i9 = mul i32 %i8, %i5
+ %i10 = add i32 %i7, 1
+ %i11 = mul i32 %i9, %i10
+ %i12 = add i32 %i11, %i10
+ %i13 = mul i32 %i12, %i9
+ %i14 = add i32 %i11, 1
+ %i15 = add i32 %i13, 1
+ %i16 = mul i32 %i13, %i14
+ %i17 = mul i32 %i16, %i15
+ %i19 = load i64, ptr addrspace(4) %i18, align 8
+ %i20 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+ %i22 = getelementptr i8, ptr addrspace(4) %i21, i64 4
+ %i23 = load i16, ptr addrspace(4) %i22, align 4
+ %i24 = zext i16 %i23 to i32
+ %i25 = mul i32 %i20, %i24
+ %i26 = add i32 %i25, %i
+ %i27 = zext i32 %i26 to i64
+ %i28 = add i64 %i19, %i27
+ %i29 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %i28
+ store i32 %i17, ptr addrspace(1) %i29, align 4
+ ret void
+}
+
+declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #2
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+declare align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #2
+declare i32 @llvm.amdgcn.workgroup.id.x() #2
+declare i64 @llvm.amdgcn.mul.u24(i32, i32)
+declare i64 @llvm.amdgcn.mul.i24(i32, i32)
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+attributes #1 = { mustprogress nofree nosync nounwind willreturn memory(read, argmem: readwrite, inaccessiblemem: none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+!0 = !{i32 0, i32 1024}
>From a05a36082a87cb97109ca9c89f1fe6b5697366d6 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 21 May 2025 19:11:19 -0500
Subject: [PATCH 6/6] NFC formatting
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 14 ++++++--------
1 file changed, 6 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d6e646c8ffc15..8f7dd96b1c0cb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -593,14 +593,12 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
// Don't cluster scalar and vector memory ops
const MachineFunction &MF = *FirstLdSt.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
- if (FirstLdSt.getOperand(0).isReg() &&
- SecondLdSt.getOperand(0).isReg()) {
- bool isFirstVecReg = RI.isVectorRegister(MRI,
- FirstLdSt.getOperand(0).getReg());
- bool isSecondVecReg = RI.isVectorRegister(MRI,
- SecondLdSt.getOperand(0).getReg());
- if ((isFirstVecReg && !isSecondVecReg) ||
- (!isFirstVecReg && isSecondVecReg))
+ if (FirstLdSt.getOperand(0).isReg() && SecondLdSt.getOperand(0).isReg()) {
+ bool isFirstVecReg =
+ RI.isVectorRegister(MRI, FirstLdSt.getOperand(0).getReg());
+ bool isSecondVecReg =
+ RI.isVectorRegister(MRI, SecondLdSt.getOperand(0).getReg());
+ if (isFirstVecReg ^ isSecondVecReg)
return false;
}
} else {
More information about the llvm-commits
mailing list