[llvm] [AMDGPU][True16][CodeGen] legalize 16bit and 32bit use-def chain for moveToVALU in si-fix-sgpr-lowering (PR #138734)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 2 21:45:38 PDT 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/138734
>From 54454329834a6e8a643580633a9acb54205cc5d8 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 6 May 2025 14:30:12 -0400
Subject: [PATCH 1/4] check for vgpr16 putting into vgpr32 case in v2s lowering
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 53 +-
.../AMDGPU/fix-sgpr-copies-f16-true16.mir | 51 ++
llvm/test/CodeGen/AMDGPU/frem.ll | 751 ++++++++++++------
3 files changed, 596 insertions(+), 259 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 8937679e460f3..9c962ddd1832e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7227,24 +7227,44 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
return DeferredList.contains(MI);
}
-// 16bit SALU use sgpr32. If a 16bit SALU get lowered to VALU in true16 mode,
-// sgpr32 is replaced to vgpr32 which is illegal in t16 inst. Need to add
-// subreg access properly. This can be removed after we have sgpr16 in place
-void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &Inst,
+// Legalize operands between 16-bit and 32-bit registers in v2s copy
+// lowering (changing sgpr to vgpr).
+// This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
+// different sizes. We need to legalize the size of the operands during the
+// vgpr lowering chain. This can be removed once sgpr16 is in place.
+void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
MachineRegisterInfo &MRI) const {
- unsigned Opcode = Inst.getOpcode();
- if (!AMDGPU::isTrue16Inst(Opcode) || !ST.useRealTrue16Insts())
+ if (!ST.useRealTrue16Insts())
return;
- for (MachineOperand &Op : Inst.explicit_operands()) {
+ unsigned Opcode = MI.getOpcode();
+ MachineBasicBlock *MBB = MI.getParent();
+
+ // legalize operands and check for size mismatch
+ for (MachineOperand &Op : MI.explicit_operands()) {
unsigned OpIdx = Op.getOperandNo();
if (!OpIdx)
continue;
- if (Op.isReg() && RI.isVGPR(MRI, Op.getReg())) {
+ if (Op.isReg() && Op.getReg().isVirtual() && RI.isVGPR(MRI, Op.getReg())) {
unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
- const TargetRegisterClass *RC = RI.getRegClass(RCID);
- if (RI.getRegSizeInBits(*RC) == 16) {
+ const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
+ const TargetRegisterClass *RC = MRI.getRegClass(Op.getReg());
+ if (32 == RI.getRegSizeInBits(*RC) &&
+ 16 == RI.getRegSizeInBits(*ExpectedRC)) {
Op.setSubReg(AMDGPU::lo16);
+ } else if (16 == RI.getRegSizeInBits(*RC) &&
+ 32 == RI.getRegSizeInBits(*ExpectedRC)) {
+ const DebugLoc &DL = MI.getDebugLoc();
+ Register NewDstReg =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+ BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
+ BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
+ .addReg(Op.getReg())
+ .addImm(AMDGPU::lo16)
+ .addReg(Undef)
+ .addImm(AMDGPU::hi16);
+ Op.setReg(NewDstReg);
}
}
}
@@ -7789,8 +7809,19 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
.addReg(Undef)
.addImm(AMDGPU::hi16);
Inst.eraseFromParent();
-
MRI.replaceRegWith(DstReg, NewDstReg);
+ // legalize useMI with mismatched size
+ for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
+ E = MRI.use_end();
+ I != E; ++I) {
+ MachineInstr &UseMI = *I->getParent();
+ unsigned UseMIOpcode = UseMI.getOpcode();
+ if (AMDGPU::isTrue16Inst(UseMIOpcode) &&
+ (16 ==
+ RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) {
+ I->setSubReg(AMDGPU::lo16);
+ }
+ }
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
return;
}
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index f9db082a2e912..9b6a2f3a1aa1e 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -57,6 +57,57 @@ body: |
%4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec
...
+---
+name: salu16_usedby_salu32
+body: |
+ bb.0:
+ ; GCN-LABEL: name: salu16_usedby_salu32
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[DEF]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_TRUNC_F16_t16_e64_]], %subreg.lo16, [[DEF2]], %subreg.hi16
+ ; GCN-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[REG_SEQUENCE]], [[DEF]], implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = COPY %0:vgpr_32
+ %2:sreg_32 = S_TRUNC_F16 %1:sreg_32, implicit $mode
+ %3:sreg_32 = S_XOR_B32 %2:sreg_32, %1:sreg_32, implicit-def $scc
+...
+
+---
+name: salu32_usedby_salu16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: salu32_usedby_salu16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[DEF]], [[DEF]], implicit $exec
+ ; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[V_XOR_B32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = COPY %0:vgpr_32
+ %2:sreg_32 = S_XOR_B32 %1:sreg_32, %1:sreg_32, implicit-def $scc
+ %3:sreg_32 = S_TRUNC_F16 %2:sreg_32, implicit $mode
+...
+
+---
+name: S_FMAC_F16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: S_FMAC_F16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sgpr_lo16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF2]], %subreg.hi16
+ ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF3]], %subreg.hi16
+ ; GCN-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_FMAC_F16_t16_e64 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_16 = IMPLICIT_DEF
+ %1:sgpr_lo16 = COPY %0:vgpr_16
+ %2:sreg_32 = COPY %0:vgpr_16
+ %3:sreg_32 = COPY %1:sgpr_lo16
+ %4:sreg_32 = S_FMAC_F16 %3:sreg_32, %3:sreg_32, %2:sreg_32, implicit $mode
+...
+
---
name: vgpr16_to_spgr32
body: |
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 125d009429cbf..7a1351174733b 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -6,7 +6,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1150 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-FAKE16 %s
define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; SI-LABEL: frem_f16:
@@ -255,42 +256,81 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: frem_f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v0, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1150-NEXT: v_mul_f32_e32 v3, v3, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_fmac_f32_e32 v3, v5, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_mul_f32_e32 v4, v5, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: frem_f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -456,26 +496,47 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: fast_frem_f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v0, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_rcp_f16_e32 v3, v2
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3
-; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: fast_frem_f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: fast_frem_f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_rcp_f16_e32 v3, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -641,26 +702,47 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: unsafe_frem_f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v0, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_rcp_f16_e32 v3, v2
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3
-; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: unsafe_frem_f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: unsafe_frem_f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_rcp_f16_e32 v3, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #1 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -2308,68 +2390,130 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: frem_v2f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v0, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX1150-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
-; GFX1150-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_fmac_f32_e32 v4, v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6
-; GFX1150-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v5, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v4, v4
-; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_fmac_f16_e32 v3, v4, v5
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX1150-NEXT: v_rcp_f32_e32 v5, v5
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v5
-; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f32_e32 v4, v6, v5
-; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v5, v6, v5
-; GFX1150-NEXT: v_and_b32_e32 v5, 0xff800000, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_add_f32_e32 v4, v5, v4
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v2, v1
-; GFX1150-NEXT: v_trunc_f16_e32 v4, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v4, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v3
-; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: frem_v2f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_b32 v3, v1, s[4:5] offset:16
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v5.l, v4.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v4.l, v0.l, v5.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v5, v3.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v5, v5
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v5
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v5
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v5, v0
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v0, v2.l, v4.l
+; GFX1150-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_v2f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v6, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v5, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v4, v4
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v3, v4, v5
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v5
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v6, v5
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v5, v4
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v2, v1
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v4, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v4, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX1150-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
%r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
@@ -3034,115 +3178,226 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: frem_v4f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v4, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[2:3]
-; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32
-; GFX1150-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v8, v7
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v8, v8
-; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v8
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_fmac_f32_e32 v6, v9, v8
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_mul_f32_e32 v8, v9, v8
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_and_b32_e32 v8, 0xff800000, v8
-; GFX1150-NEXT: v_add_f32_e32 v6, v8, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v7, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v6, v6
-; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_fmac_f16_e32 v5, v6, v7
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX1150-NEXT: v_rcp_f32_e32 v7, v7
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v7
-; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f32_e32 v6, v8, v7
-; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7
-; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_add_f32_e32 v6, v7, v6
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v2, v0
-; GFX1150-NEXT: v_trunc_f16_e32 v6, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6
-; GFX1150-NEXT: v_fma_f16 v0, v6, v2, v0
-; GFX1150-NEXT: v_lshrrev_b32_e32 v6, 16, v3
-; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1150-NEXT: v_pack_b32_f16 v0, v0, v5
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT: v_rcp_f32_e32 v7, v7
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v7
-; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f32_e32 v5, v8, v7
-; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7
-; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v6, v2
-; GFX1150-NEXT: v_trunc_f16_e32 v5, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT: v_fmac_f16_e32 v2, v5, v6
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v1
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_fmac_f32_e32 v5, v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6
-; GFX1150-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v3, v1
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v5, v5
-; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v5, v3
-; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: frem_v4f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_b64 v[1:2], v5, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_b64 v[3:4], v5, s[4:5] offset:32
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.h
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.h
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v6.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v0.l, v7.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v3.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v7
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v10, v7
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v8, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v7, v0
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v4.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v6.l
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v3, v3
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v3
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v3, v0
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v6.l, v3.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v6.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v3.l
+; GFX1150-TRUE16-NEXT: global_store_b64 v5, v[1:2], s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_v4f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_b64 v[0:1], v4, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v8, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v8, v8
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v8
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v6, v9, v8
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v8, 0xff800000, v8
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v6, v8, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v7, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v6, v6
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v5, v6, v7
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v0
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v7
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v2, v0
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v6, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6
+; GFX1150-FAKE16-NEXT: v_fma_f16 v0, v6, v2, v0
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v5
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v7
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v5, v8, v7
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v5, v7, v5
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v6, v2
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v5, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v2, v5, v6
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v5, v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v3, v1
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v5, v5
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v5, v3
+; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX1150-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
%r0 = load <4 x half>, ptr addrspace(1) %in1, align 16
>From a2cab8959f27169c9c5038473c646fcf12e786d5 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Mon, 12 May 2025 14:22:58 -0400
Subject: [PATCH 2/4] address comment
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9c962ddd1832e..95fd084b2edc5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7245,15 +7245,16 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
unsigned OpIdx = Op.getOperandNo();
if (!OpIdx)
continue;
- if (Op.isReg() && Op.getReg().isVirtual() && RI.isVGPR(MRI, Op.getReg())) {
- unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
- const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
+ if (Op.isReg() && Op.getReg().isVirtual()) {
const TargetRegisterClass *RC = MRI.getRegClass(Op.getReg());
- if (32 == RI.getRegSizeInBits(*RC) &&
- 16 == RI.getRegSizeInBits(*ExpectedRC)) {
+ if (!RI.isVGPRClass(RC))
+ continue;
+ unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
+ unsigned expectedSize = RI.getRegSizeInBits(*RI.getRegClass(RCID));
+ unsigned currSize = RI.getRegSizeInBits(*RC);
+ if (expectedSize == 16 && currSize == 32) {
Op.setSubReg(AMDGPU::lo16);
- } else if (16 == RI.getRegSizeInBits(*RC) &&
- 32 == RI.getRegSizeInBits(*ExpectedRC)) {
+ } else if (expectedSize == 32 && currSize == 16) {
const DebugLoc &DL = MI.getDebugLoc();
Register NewDstReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
>From fc6ad7281e0da833daa062ae3b33b1f5ef62b3e4 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 22 May 2025 11:59:22 -0400
Subject: [PATCH 3/4] address comment
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 95fd084b2edc5..46dff5a1ea2f4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7246,15 +7246,14 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
if (!OpIdx)
continue;
if (Op.isReg() && Op.getReg().isVirtual()) {
- const TargetRegisterClass *RC = MRI.getRegClass(Op.getReg());
- if (!RI.isVGPRClass(RC))
+ const TargetRegisterClass *DefRC = MRI.getRegClass(Op.getReg());
+ if (!RI.isVGPRClass(DefRC))
continue;
unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
- unsigned expectedSize = RI.getRegSizeInBits(*RI.getRegClass(RCID));
- unsigned currSize = RI.getRegSizeInBits(*RC);
- if (expectedSize == 16 && currSize == 32) {
+ const TargetRegisterClass *UseRC = RI.getRegClass(RCID);
+ if (RI.getMatchingSuperRegClass(DefRC, UseRC, AMDGPU::lo16)) {
Op.setSubReg(AMDGPU::lo16);
- } else if (expectedSize == 32 && currSize == 16) {
+ } else if (RI.getMatchingSuperRegClass(UseRC, DefRC, AMDGPU::lo16)) {
const DebugLoc &DL = MI.getDebugLoc();
Register NewDstReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
>From a921e8b6308f9128161a2b9d3fb96d09c56da68b Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Mon, 2 Jun 2025 17:15:50 -0400
Subject: [PATCH 4/4] patch 2
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 108 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +
.../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 3315 ++++++++---------
.../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll | 436 +--
.../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 554 ++-
.../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 1649 ++++----
llvm/test/CodeGen/AMDGPU/br_cc.f16.ll | 35 +-
7 files changed, 2889 insertions(+), 3210 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 46dff5a1ea2f4..3ead63864b84e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7232,43 +7232,50 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
// This is mainly caused by 16bit SALU and 16bit VALU using reg with different
// size. Need to legalize the size of the operands during the vgpr lowering
// chain. This can be removed after we have sgpr16 in place
-void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
+void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
MachineRegisterInfo &MRI) const {
if (!ST.useRealTrue16Insts())
return;
unsigned Opcode = MI.getOpcode();
MachineBasicBlock *MBB = MI.getParent();
-
// legalize operands and check for size mismatch
- for (MachineOperand &Op : MI.explicit_operands()) {
- unsigned OpIdx = Op.getOperandNo();
- if (!OpIdx)
- continue;
- if (Op.isReg() && Op.getReg().isVirtual()) {
- const TargetRegisterClass *DefRC = MRI.getRegClass(Op.getReg());
- if (!RI.isVGPRClass(DefRC))
- continue;
- unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
- const TargetRegisterClass *UseRC = RI.getRegClass(RCID);
- if (RI.getMatchingSuperRegClass(DefRC, UseRC, AMDGPU::lo16)) {
- Op.setSubReg(AMDGPU::lo16);
- } else if (RI.getMatchingSuperRegClass(UseRC, DefRC, AMDGPU::lo16)) {
- const DebugLoc &DL = MI.getDebugLoc();
- Register NewDstReg =
- MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
- BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
- BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
- .addReg(Op.getReg())
- .addImm(AMDGPU::lo16)
- .addReg(Undef)
- .addImm(AMDGPU::hi16);
- Op.setReg(NewDstReg);
- }
- }
+ if (!OpIdx || OpIdx >= MI.getNumExplicitOperands())
+ return;
+
+ MachineOperand &Op = MI.getOperand(OpIdx);
+ if (!Op.isReg() || !Op.getReg().isVirtual())
+ return;
+
+ const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
+ if (!RI.isVGPRClass(CurrRC))
+ return;
+
+ if (OpIdx >= get(Opcode).getNumOperands())
+ return;
+
+ unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
+ const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
+ if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
+ Op.setSubReg(AMDGPU::lo16);
+ } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
+ const DebugLoc &DL = MI.getDebugLoc();
+ Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+ BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
+ BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
+ .addReg(Op.getReg())
+ .addImm(AMDGPU::lo16)
+ .addReg(Undef)
+ .addImm(AMDGPU::hi16);
+ Op.setReg(NewDstReg);
}
}
+void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
+ MachineRegisterInfo &MRI) const {
+ for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
+ legalizeOperandsVALUt16(MI, OpIdx, MRI);
+}
void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
MachineDominatorTree *MDT) const {
@@ -7789,15 +7796,14 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
- // If this is a v2s copy src from 16bit to 32bit,
- // replace vgpr copy to reg_sequence
+ // If this is a v2s copy between a 16bit and a 32bit reg,
+ // replace the vgpr copy with a reg_sequence/extract_subreg
// This can be remove after we have sgpr16 in place
if (ST.useRealTrue16Insts() && Inst.isCopy() &&
Inst.getOperand(1).getReg().isVirtual() &&
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
- if (16 == RI.getRegSizeInBits(*SrcRegRC) &&
- 32 == RI.getRegSizeInBits(*NewDstRC)) {
+ if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
@@ -7810,18 +7816,13 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
.addImm(AMDGPU::hi16);
Inst.eraseFromParent();
MRI.replaceRegWith(DstReg, NewDstReg);
- // legalize useMI with mismatched size
- for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
- E = MRI.use_end();
- I != E; ++I) {
- MachineInstr &UseMI = *I->getParent();
- unsigned UseMIOpcode = UseMI.getOpcode();
- if (AMDGPU::isTrue16Inst(UseMIOpcode) &&
- (16 ==
- RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) {
- I->setSubReg(AMDGPU::lo16);
- }
- }
+ addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+ return;
+ } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
+ AMDGPU::lo16)) {
+ Inst.getOperand(1).setSubReg(AMDGPU::lo16);
+ Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
+ MRI.replaceRegWith(DstReg, NewDstReg);
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
return;
}
@@ -7916,23 +7917,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
assert(NewDstRC);
NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
-
- // Check useMI of NewInstr. If used by a true16 instruction,
- // add a lo16 subreg access if size mismatched
- // This can be remove after we have sgpr16 in place
- if (ST.useRealTrue16Insts() && NewDstRC == &AMDGPU::VGPR_32RegClass) {
- for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
- E = MRI.use_end();
- I != E; ++I) {
- MachineInstr &UseMI = *I->getParent();
- unsigned UseMIOpcode = UseMI.getOpcode();
- if (AMDGPU::isTrue16Inst(UseMIOpcode) &&
- (16 ==
- RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) {
- I->setSubReg(AMDGPU::lo16);
- }
- }
- }
}
fixImplicitOperands(*NewInstr);
@@ -8740,6 +8724,8 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
++I;
} while (I != E && I->getParent() == &UseMI);
} else {
+ legalizeOperandsVALUt16(UseMI, OpNo, MRI);
+
++I;
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 64ab064a75f44..01dd3c9f4119e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1304,6 +1304,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
/// Fix operands in Inst to fix 16bit SALU to VALU lowering.
void legalizeOperandsVALUt16(MachineInstr &Inst,
MachineRegisterInfo &MRI) const;
+ void legalizeOperandsVALUt16(MachineInstr &Inst, unsigned OpIdx,
+ MachineRegisterInfo &MRI) const;
/// Replace the instructions opcode with the equivalent VALU
/// opcode. This function will also move the users of MachineInstruntions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 44abfd272be88..9126b08857153 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -21659,134 +21659,119 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v86, v0
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:192
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v28 :: v_dual_mov_b32 v34, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v24 :: v_dual_mov_b32 v38, v23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v22 :: v_dual_mov_b32 v49, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v20 :: v_dual_mov_b32 v51, v19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v18 :: v_dual_mov_b32 v53, v17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v16 :: v_dual_mov_b32 v54, v15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v14 :: v_dual_mov_b32 v65, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v12 :: v_dual_mov_b32 v66, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, v10 :: v_dual_mov_b32 v69, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v8 :: v_dual_mov_b32 v70, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, v6 :: v_dual_mov_b32 v81, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v4 :: v_dual_mov_b32 v82, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v2 :: v_dual_mov_b32 v85, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
@@ -21799,305 +21784,305 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v66
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v65
; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v55
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v49
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v51
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v79
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v89
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v75
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v78
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v77
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v63
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v72
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v59
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v47
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v183
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v179
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v180
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v181
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v167
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v176
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v177
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v163
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v164
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v151
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v160
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v161
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v135
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v145
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v131
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v133
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v128
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v114
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v102
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v112
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v98
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v100
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v87
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v96
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v97
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -22184,31 +22169,31 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3
; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v86
; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v84
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v71
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v70
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -22224,61 +22209,61 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v68
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v64
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v52
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v51
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v50
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v33
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -22289,63 +22274,63 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v92
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v91
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v90
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v89
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v88
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v78
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v77
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v76
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v75
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v74
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v72
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v63
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v61
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v60
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v59
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -22356,61 +22341,61 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v58
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v57
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v56
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v47
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v46
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v45
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v44
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v43
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v42
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v41
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v40
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v183
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v182
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v181
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v180
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v179
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v178
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v177
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v167
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -22421,61 +22406,61 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v166
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v165
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v163
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v162
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v161
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v160
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v151
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v150
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v148
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v147
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v146
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v144
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v135
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v133
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v131
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -22486,52 +22471,52 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v130
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v128
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v113
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v114
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v116
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v101
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v100
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v102
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v96
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v87
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v33, v29
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v36, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v34
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -60371,134 +60356,119 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v86, v0
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:192
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v28 :: v_dual_mov_b32 v34, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v24 :: v_dual_mov_b32 v38, v23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v22 :: v_dual_mov_b32 v49, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v20 :: v_dual_mov_b32 v51, v19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v18 :: v_dual_mov_b32 v53, v17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v16 :: v_dual_mov_b32 v54, v15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v14 :: v_dual_mov_b32 v65, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v12 :: v_dual_mov_b32 v66, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, v10 :: v_dual_mov_b32 v69, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v8 :: v_dual_mov_b32 v70, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, v6 :: v_dual_mov_b32 v81, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v4 :: v_dual_mov_b32 v82, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v2 :: v_dual_mov_b32 v85, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB39_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
@@ -60511,305 +60481,305 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v66
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v65
; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v55
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v49
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v51
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v79
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v89
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v75
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v78
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v77
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v63
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v72
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v59
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v47
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v183
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v179
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v180
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v181
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v167
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v176
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v177
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v163
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v164
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v151
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v160
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v161
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v135
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v145
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v131
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v133
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v128
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v114
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v102
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v112
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v98
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v100
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v87
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v96
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v97
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -60896,31 +60866,31 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3
; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v86
; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v84
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v71
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v70
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -60936,61 +60906,61 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v68
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v64
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v52
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v51
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v50
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v33
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -61001,63 +60971,63 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v92
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v91
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v90
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v89
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v88
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v78
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v77
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v76
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v75
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v74
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v72
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v63
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v61
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v60
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v59
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -61068,61 +61038,61 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v58
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v57
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v56
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v47
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v46
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v45
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v44
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v43
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v42
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v41
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v40
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v183
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v182
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v181
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v180
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v179
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v178
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v177
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v167
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -61133,61 +61103,61 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v166
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v165
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v163
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v162
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v161
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v160
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v151
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v150
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v148
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v147
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v146
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v144
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v135
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v133
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v131
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -61198,52 +61168,52 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v130
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v128
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v113
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v114
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v116
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v101
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v100
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v102
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v96
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v87
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v33, v29
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v36, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v34
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -97102,134 +97072,119 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v86, v0
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:192
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v28 :: v_dual_mov_b32 v34, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v24 :: v_dual_mov_b32 v38, v23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v22 :: v_dual_mov_b32 v49, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v20 :: v_dual_mov_b32 v51, v19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v18 :: v_dual_mov_b32 v53, v17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v16 :: v_dual_mov_b32 v54, v15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v14 :: v_dual_mov_b32 v65, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v12 :: v_dual_mov_b32 v66, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, v10 :: v_dual_mov_b32 v69, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v8 :: v_dual_mov_b32 v70, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, v6 :: v_dual_mov_b32 v81, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v4 :: v_dual_mov_b32 v82, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v2 :: v_dual_mov_b32 v85, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
@@ -97242,305 +97197,305 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v66
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v65
; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v55
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v49
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v51
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v79
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v89
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v75
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v78
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v77
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v63
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v72
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v59
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v47
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v183
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v179
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v180
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v181
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v167
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v176
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v177
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v163
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v164
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v151
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v160
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v161
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v135
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v145
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v131
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v133
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v128
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v114
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v102
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v112
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v98
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v100
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v87
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v96
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v97
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -97627,31 +97582,31 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3
; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v86
; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v84
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v71
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v70
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -97667,61 +97622,61 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v68
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v64
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v52
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v51
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v50
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v33
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -97732,63 +97687,63 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v92
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v91
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v90
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v89
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v88
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v78
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v77
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v76
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v75
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v74
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v72
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v63
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v61
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v60
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v59
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -97799,61 +97754,61 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v58
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v57
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v56
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v47
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v46
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v45
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v44
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v43
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v42
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v41
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v40
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v183
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v182
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v181
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v180
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v179
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v178
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v177
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v167
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -97864,61 +97819,61 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v166
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v165
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v163
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v162
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v161
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v160
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v151
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v150
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v148
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v147
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v146
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v144
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v135
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v133
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v131
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -97929,52 +97884,52 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v130
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v128
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v113
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v114
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v116
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v101
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v100
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v102
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v96
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v87
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v33, v29
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v36, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v34
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -133776,134 +133731,119 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v86, v0
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:192
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v28 :: v_dual_mov_b32 v34, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v24 :: v_dual_mov_b32 v38, v23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v22 :: v_dual_mov_b32 v49, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v20 :: v_dual_mov_b32 v51, v19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v18 :: v_dual_mov_b32 v53, v17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v16 :: v_dual_mov_b32 v54, v15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v14 :: v_dual_mov_b32 v65, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v12 :: v_dual_mov_b32 v66, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, v10 :: v_dual_mov_b32 v69, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v8 :: v_dual_mov_b32 v70, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, v6 :: v_dual_mov_b32 v81, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v4 :: v_dual_mov_b32 v82, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v2 :: v_dual_mov_b32 v85, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB75_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
@@ -133916,305 +133856,305 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v66
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v65
; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v55
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v49
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v51
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v79
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v89
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v75
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v78
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v77
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v63
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v72
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v59
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v47
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v183
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v179
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v180
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v181
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v167
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v176
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v177
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v163
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v164
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v151
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v160
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v161
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v135
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v145
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v131
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v133
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v128
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v114
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v102
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v112
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v98
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v100
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v87
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v96
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v97
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -134301,31 +134241,31 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3
; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v86
; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v84
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v71
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v70
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -134341,61 +134281,61 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v68
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v64
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v52
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v51
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v50
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v33
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -134406,63 +134346,63 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v92
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v91
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v90
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v89
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v88
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v78
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v77
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v76
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v75
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v74
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v72
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v63
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v61
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v60
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v59
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -134473,61 +134413,61 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v58
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v57
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v56
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v47
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v46
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v45
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v44
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v43
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v42
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v41
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v40
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v183
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v182
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v181
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v180
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v179
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v178
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v177
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v167
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -134538,61 +134478,61 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v166
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v165
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v163
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v162
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v161
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v160
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v151
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v150
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v148
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v147
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v146
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v144
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v135
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v133
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v131
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -134603,52 +134543,52 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v130
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v128
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v113
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v114
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v116
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v101
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v100
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v102
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v96
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v87
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v33, v29
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v36, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v34
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -160847,8 +160787,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v6 :: v_dual_mov_b32 v33, v0
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304
@@ -160898,72 +160839,56 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, v30 :: v_dual_mov_b32 v81, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v28 :: v_dual_mov_b32 v83, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v85, v26 :: v_dual_mov_b32 v80, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v25 :: v_dual_mov_b32 v82, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v23 :: v_dual_mov_b32 v64, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v20 :: v_dual_mov_b32 v68, v19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, v18 :: v_dual_mov_b32 v55, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v17 :: v_dual_mov_b32 v65, v15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, v14 :: v_dual_mov_b32 v49, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v54, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v11 :: v_dual_mov_b32 v36, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v37, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v7 :: v_dual_mov_b32 v35, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v39, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v32, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB89_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -161021,20 +160946,20 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v86
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v85
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v83
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v98
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v2, 16, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v84
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v87
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -161042,48 +160967,48 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v97
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v112
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v115
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v119
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v128
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v129
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19
@@ -161428,84 +161353,84 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v144
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v135
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v133
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v131
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v132, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v129
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v130
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v128
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v119
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v117
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v113
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v114
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v112
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v103
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v100
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v98
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v86
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v87
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v96
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v55, 3, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v97
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v98
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v85
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v87
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v85
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v84
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v5
@@ -161611,16 +161536,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v128
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v129, 16, v33
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v117, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
@@ -161639,7 +161564,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v131, 16, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33
@@ -188892,8 +188817,9 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v6 :: v_dual_mov_b32 v33, v0
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304
@@ -188943,72 +188869,56 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, v30 :: v_dual_mov_b32 v81, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v28 :: v_dual_mov_b32 v83, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v85, v26 :: v_dual_mov_b32 v80, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v25 :: v_dual_mov_b32 v82, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v23 :: v_dual_mov_b32 v64, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v20 :: v_dual_mov_b32 v68, v19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, v18 :: v_dual_mov_b32 v55, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v17 :: v_dual_mov_b32 v65, v15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, v14 :: v_dual_mov_b32 v49, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v54, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v11 :: v_dual_mov_b32 v36, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v37, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v7 :: v_dual_mov_b32 v35, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v39, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v32, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB93_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -189066,20 +188976,20 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v86
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v85
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v83
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v98
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v2, 16, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v84
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v87
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -189087,48 +188997,48 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v97
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v112
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v115
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v119
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v128
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v129
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19
@@ -189473,84 +189383,84 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v144
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v135
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v133
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v131
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v132, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v129
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v130
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v128
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v119
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v117
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v113
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v114
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v112
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v103
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v100
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v98
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v86
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v87
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v96
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v55, 3, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v97
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v98
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v85
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v87
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v85
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v84
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v5
@@ -189656,16 +189566,16 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v128
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v129, 16, v33
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v117, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
@@ -189684,7 +189594,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v131, 16, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33
@@ -212549,8 +212459,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v6 :: v_dual_mov_b32 v33, v0
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304
@@ -212600,72 +212511,56 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, v30 :: v_dual_mov_b32 v81, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v28 :: v_dual_mov_b32 v83, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v85, v26 :: v_dual_mov_b32 v80, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v25 :: v_dual_mov_b32 v82, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v23 :: v_dual_mov_b32 v64, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v20 :: v_dual_mov_b32 v68, v19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, v18 :: v_dual_mov_b32 v55, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v17 :: v_dual_mov_b32 v65, v15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, v14 :: v_dual_mov_b32 v49, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v54, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v11 :: v_dual_mov_b32 v36, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v37, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v7 :: v_dual_mov_b32 v35, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v39, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v32, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB97_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -212723,20 +212618,20 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v86
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v85
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v83
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v98
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v2, 16, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v84
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v87
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -212744,48 +212639,48 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v97
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v112
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v115
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v119
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v128
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v129
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19
@@ -213130,84 +213025,84 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v144
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v135
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v133
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v131
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v132, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v129
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v130
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v128
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v119
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v117
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v113
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v114
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v112
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v103
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v100
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v98
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v86
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v87
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v96
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v55, 3, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v97
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v98
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v85
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v87
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v85
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v84
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v5
@@ -213313,16 +213208,16 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v128
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v129, 16, v33
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v117, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
@@ -213341,7 +213236,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v131, 16, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index 178718a338432..8dc00701dcfd6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -7393,15 +7393,10 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v15, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v3 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v21, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB27_4
@@ -7441,27 +7436,27 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v19
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v15
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v14, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
@@ -7470,14 +7465,14 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6
; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v23
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -7535,22 +7530,22 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v8
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v16
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
@@ -14771,15 +14766,10 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v15, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v3 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v21, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
@@ -14819,27 +14809,27 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v19
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v15
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v14, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
@@ -14848,14 +14838,14 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6
; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v23
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -14913,22 +14903,22 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v8
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v16
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
@@ -21656,15 +21646,10 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v15, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v3 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v21, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB71_4
@@ -21704,27 +21689,27 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v19
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v15
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v14, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
@@ -21733,14 +21718,14 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6
; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v23
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -21798,22 +21783,22 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v8
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v16
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
@@ -28039,15 +28024,10 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v15, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v3 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v21, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB87_4
@@ -28087,27 +28067,27 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v19
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v15
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v14, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
@@ -28116,14 +28096,14 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6
; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v23
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -28181,22 +28161,22 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v8
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v16
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
@@ -34105,20 +34085,10 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v21, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v19, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v15, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v2 :: v_dual_mov_b32 v17, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_4
@@ -34149,25 +34119,25 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v16
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v13
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6
@@ -34231,42 +34201,42 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v22
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v15
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
@@ -34280,7 +34250,7 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v6
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v7
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v8, 16, v7
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
@@ -39305,20 +39275,10 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v21, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v19, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v15, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v2 :: v_dual_mov_b32 v17, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB107_4
@@ -39349,25 +39309,25 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v16
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v13
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6
@@ -39431,42 +39391,42 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v22
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v15
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
@@ -39480,7 +39440,7 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v6
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v7
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v8, 16, v7
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
@@ -43653,20 +43613,10 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v21, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v19, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v15, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v2 :: v_dual_mov_b32 v17, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB111_4
@@ -43697,25 +43647,25 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v16
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v13
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6
@@ -43779,42 +43729,42 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v22
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v15
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
@@ -43828,7 +43778,7 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v6
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v7
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v8, 16, v7
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index d966d136d75b6..73c730f3c30dd 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -6469,17 +6469,11 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v23, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v25, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v5 :: v_dual_mov_b32 v27, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_mov_b32 v31, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
@@ -6509,7 +6503,7 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v30
; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
@@ -6527,29 +6521,29 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v27
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v28
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v29
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v22
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v14
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v18
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v19
@@ -6557,9 +6551,9 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v32, v33
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v22, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v34, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v36, v37
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v36, v37
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -6568,12 +6562,12 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v34, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -6631,36 +6625,36 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v30
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v28
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v26
; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v27
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v24
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v25
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18
@@ -13954,17 +13948,11 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v23, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v25, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v5 :: v_dual_mov_b32 v27, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_mov_b32 v31, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4
@@ -13994,7 +13982,7 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v30
; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
@@ -14012,29 +14000,29 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v27
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v28
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v29
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v22
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v14
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v18
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v19
@@ -14042,9 +14030,9 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v32, v33
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v22, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v34, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v36, v37
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v36, v37
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -14053,12 +14041,12 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v34, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -14116,36 +14104,36 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v30
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v28
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v26
; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v27
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v24
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v25
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18
@@ -21021,28 +21009,14 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v15 :: v_dual_mov_b32 v25, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v27, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v36, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v10 :: v_dual_mov_b32 v24, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v7 :: v_dual_mov_b32 v26, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v5 :: v_dual_mov_b32 v35, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v3 :: v_dual_mov_b32 v38, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v37, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
@@ -21073,44 +21047,44 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v37
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v38
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v23
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v32
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v7
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v19
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v5
@@ -21170,61 +21144,61 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v18
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v32
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v31
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v21
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v29
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v32
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v25
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v33
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v37
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v28
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
@@ -27569,28 +27543,14 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v15 :: v_dual_mov_b32 v25, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v27, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v36, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v10 :: v_dual_mov_b32 v24, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v7 :: v_dual_mov_b32 v26, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v5 :: v_dual_mov_b32 v35, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v3 :: v_dual_mov_b32 v38, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v37, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB63_4
@@ -27621,44 +27581,44 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v37
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v38
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v23
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v32
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v7
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v19
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v5
@@ -27718,61 +27678,61 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v18
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v32
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v31
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v21
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v29
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v32
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v25
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v33
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v37
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v28
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
@@ -31989,23 +31949,14 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v15 :: v_dual_mov_b32 v23, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v14 :: v_dual_mov_b32 v25, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v11 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v29, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v8 :: v_dual_mov_b32 v31, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v4 :: v_dual_mov_b32 v35, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB73_4
@@ -32035,7 +31986,7 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v36
; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
@@ -32053,29 +32004,29 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v35
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v34
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v32
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v29
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v18
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v19
@@ -32157,49 +32108,49 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v36
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v34
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v33
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v32
; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v32
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v24
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v17
@@ -36633,23 +36584,14 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v15 :: v_dual_mov_b32 v23, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v14 :: v_dual_mov_b32 v25, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v11 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v29, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v8 :: v_dual_mov_b32 v31, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v4 :: v_dual_mov_b32 v35, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB77_4
@@ -36679,7 +36621,7 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v36
; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
@@ -36697,29 +36639,29 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v35
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v34
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v32
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v29
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v18
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v19
@@ -36801,49 +36743,49 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v36
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v34
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v33
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v32
; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v32
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v24
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v17
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 397955a8a8928..ca27410a1c127 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -15124,42 +15124,34 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v16i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_mov_b32 v54, v0
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v15 :: v_dual_mov_b32 v32, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v34, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v12 :: v_dual_mov_b32 v36, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v9 :: v_dual_mov_b32 v38, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v8 :: v_dual_mov_b32 v48, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v5 :: v_dual_mov_b32 v50, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v4 :: v_dual_mov_b32 v52, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB27_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -15196,37 +15188,37 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v37
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v34
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7
@@ -15267,28 +15259,28 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v82
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87
@@ -15355,7 +15347,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v54
; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
@@ -15367,14 +15359,14 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v49
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
@@ -15387,31 +15379,31 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
@@ -15461,42 +15453,42 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v83
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v69
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v67
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v68
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15
@@ -30479,42 +30471,34 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v16f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_mov_b32 v54, v0
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v15 :: v_dual_mov_b32 v32, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v34, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v12 :: v_dual_mov_b32 v36, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v9 :: v_dual_mov_b32 v38, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v8 :: v_dual_mov_b32 v48, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v5 :: v_dual_mov_b32 v50, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v4 :: v_dual_mov_b32 v52, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -30551,37 +30535,37 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v37
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v34
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7
@@ -30622,28 +30606,28 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v82
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87
@@ -30710,7 +30694,7 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v54
; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
@@ -30722,14 +30706,14 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v49
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
@@ -30742,31 +30726,31 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
@@ -30816,42 +30800,42 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v83
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v69
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v67
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v68
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15
@@ -45105,42 +45089,34 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v8i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_mov_b32 v54, v0
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v15 :: v_dual_mov_b32 v32, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v34, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v12 :: v_dual_mov_b32 v36, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v9 :: v_dual_mov_b32 v38, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v8 :: v_dual_mov_b32 v48, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v5 :: v_dual_mov_b32 v50, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v4 :: v_dual_mov_b32 v52, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB71_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -45177,37 +45153,37 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v37
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v34
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7
@@ -45248,28 +45224,28 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v82
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87
@@ -45336,7 +45312,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v54
; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
@@ -45348,14 +45324,14 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v49
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
@@ -45368,31 +45344,31 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
@@ -45442,42 +45418,42 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v83
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v69
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v67
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v68
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15
@@ -58885,42 +58861,34 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v8f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_mov_b32 v54, v0
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v15 :: v_dual_mov_b32 v32, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v34, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v12 :: v_dual_mov_b32 v36, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v9 :: v_dual_mov_b32 v38, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v8 :: v_dual_mov_b32 v48, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v5 :: v_dual_mov_b32 v50, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v4 :: v_dual_mov_b32 v52, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB87_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -58957,37 +58925,37 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v37
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v34
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7
@@ -59028,28 +58996,28 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v82
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87
@@ -59116,7 +59084,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v54
; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
@@ -59128,14 +59096,14 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v49
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
@@ -59148,31 +59116,31 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
@@ -59222,42 +59190,42 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v83
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v69
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v67
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v68
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15
@@ -72878,57 +72846,34 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_mov_b32 v52, v0
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v15 :: v_dual_mov_b32 v34, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v13 :: v_dual_mov_b32 v54, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v50, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v11 :: v_dual_mov_b32 v48, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v9 :: v_dual_mov_b32 v36, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v6 :: v_dual_mov_b32 v38, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v39, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -72952,8 +72897,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9
; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8
@@ -72961,14 +72906,14 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v50
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
@@ -72977,10 +72922,10 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v48
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v35
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v49
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v2, 16, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8
@@ -72988,63 +72933,63 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v17
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v0, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v23
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v66
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v64
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v83
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11
@@ -73103,151 +73048,151 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v84
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v70
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v68
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v22
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v26
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v29
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v37
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v25
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v34
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v28, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v16, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v20, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v32
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v22, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v18, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v23, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v25, 16, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v8
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v16, 16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v17, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v18
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v37, 16, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v6
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v21
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v2, 16, v16
@@ -85910,57 +85855,34 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_mov_b32 v52, v0
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v15 :: v_dual_mov_b32 v34, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v13 :: v_dual_mov_b32 v54, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v50, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v11 :: v_dual_mov_b32 v48, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v9 :: v_dual_mov_b32 v36, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v6 :: v_dual_mov_b32 v38, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v39, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB107_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -85984,8 +85906,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9
; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8
@@ -85993,14 +85915,14 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v50
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
@@ -86009,10 +85931,10 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v48
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v35
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v49
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v2, 16, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8
@@ -86020,63 +85942,63 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v17
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v0, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v23
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v66
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v64
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v83
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11
@@ -86135,151 +86057,151 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v84
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v70
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v68
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v22
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v26
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v29
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v37
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v25
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v34
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v28, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v16, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v20, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v32
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v22, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v18, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v23, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v25, 16, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v8
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v16, 16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v17, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v18
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v37, 16, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v6
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v21
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v2, 16, v16
@@ -97280,57 +97202,34 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32bf16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_mov_b32 v52, v0
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v15 :: v_dual_mov_b32 v34, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v13 :: v_dual_mov_b32 v54, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v50, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v11 :: v_dual_mov_b32 v48, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v9 :: v_dual_mov_b32 v36, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v6 :: v_dual_mov_b32 v38, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v39, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB111_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -97354,8 +97253,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9
; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8
@@ -97363,14 +97262,14 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v50
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
@@ -97379,10 +97278,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v48
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v35
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v49
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v2, 16, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8
@@ -97390,63 +97289,63 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v17
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v0, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v23
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v66
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v64
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v83
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11
@@ -97505,151 +97404,151 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v84
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v70
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v68
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v22
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v26
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v29
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v37
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v25
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v34
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v28, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v16, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v20, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v32
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v22, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v18, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v23, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v25, 16, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v8
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v16, 16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v17, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v18
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v37, 16, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v6
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v21
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v2, 16, v16
diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
index b27ad26cf97b9..4cb5b7c43a46d 100644
--- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -77,18 +77,19 @@ define amdgpu_kernel void @br_cc_f16(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0.h, v1.h
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB0_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %one
-; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-TRUE16-NEXT: s_endpgm
+; GFX11-TRUE16-NEXT: s_branch .LBB0_3
; GFX11-TRUE16-NEXT: .LBB0_2: ; %two
-; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[0:3], 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT: .LBB0_3: ; %one
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: br_cc_f16:
@@ -192,13 +193,15 @@ define amdgpu_kernel void @br_cc_f16_imm_a(
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s3
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v0.h
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB1_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %one
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x3800
+; GFX11-TRUE16-NEXT: s_branch .LBB1_3
; GFX11-TRUE16-NEXT: .LBB1_2: ; %two
+; GFX11-TRUE16-NEXT: .LBB1_3: ; %one
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
@@ -298,13 +301,15 @@ define amdgpu_kernel void @br_cc_f16_imm_b(
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s3
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v1.l
-; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_2
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %two
+; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v0.h
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB2_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %one
+; GFX11-TRUE16-NEXT: s_branch .LBB2_3
+; GFX11-TRUE16-NEXT: .LBB2_2: ; %two
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x3800
-; GFX11-TRUE16-NEXT: .LBB2_2: ; %one
+; GFX11-TRUE16-NEXT: .LBB2_3: ; %one
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
More information about the llvm-commits
mailing list