[llvm] b668b64 - [AMDGPU][True16][CodeGen] legalize 16bit and 32bit use-def chain for moveToVALU in si-fix-sgpr-lowering (#138734)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 4 06:53:15 PDT 2025
Author: Brox Chen
Date: 2025-06-04T09:53:10-04:00
New Revision: b668b6439acb9057faadd788c9351a7d20bf140e
URL: https://github.com/llvm/llvm-project/commit/b668b6439acb9057faadd788c9351a7d20bf140e
DIFF: https://github.com/llvm/llvm-project/commit/b668b6439acb9057faadd788c9351a7d20bf140e.diff
LOG: [AMDGPU][True16][CodeGen] legalize 16bit and 32bit use-def chain for moveToVALU in si-fix-sgpr-lowering (#138734)
Two changes in this patch:
1. Covered another case in the legalizeOperandsVALUt16 functions and the COPY
lowering: when a SALU16 is used by a SALU32, a reg_sequence needs to be
inserted after the move to VALU (previously only the case of a SALU32 used
by a SALU16 was considered).
2. Moved the useMI analysis into addUsersToMoveToVALUWorklist. Legalize the
targeted operand when needed.
Turn on the frem test with true16 mode for gfx1150, which was failing before
this patch. A few bitcast tests are also impacted by this change, with some
v_mov being replaced by dual mov.
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
llvm/test/CodeGen/AMDGPU/frem.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 8937679e460f3..a27d4eeee97f4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7227,27 +7227,52 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
return DeferredList.contains(MI);
}
-// 16bit SALU use sgpr32. If a 16bit SALU get lowered to VALU in true16 mode,
-// sgpr32 is replaced to vgpr32 which is illegal in t16 inst. Need to add
-// subreg access properly. This can be removed after we have sgpr16 in place
-void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &Inst,
+// Legalize size mismatches between 16bit and 32bit registers in v2s copy
+// lowering (change spgr to vgpr).
+// This is mainly caused by 16bit SALU and 16bit VALU using reg with different
+// size. Need to legalize the size of the operands during the vgpr lowering
+// chain. This can be removed after we have sgpr16 in place
+void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
MachineRegisterInfo &MRI) const {
- unsigned Opcode = Inst.getOpcode();
- if (!AMDGPU::isTrue16Inst(Opcode) || !ST.useRealTrue16Insts())
+ if (!ST.useRealTrue16Insts())
return;
- for (MachineOperand &Op : Inst.explicit_operands()) {
- unsigned OpIdx = Op.getOperandNo();
- if (!OpIdx)
- continue;
- if (Op.isReg() && RI.isVGPR(MRI, Op.getReg())) {
- unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
- const TargetRegisterClass *RC = RI.getRegClass(RCID);
- if (RI.getRegSizeInBits(*RC) == 16) {
- Op.setSubReg(AMDGPU::lo16);
- }
- }
- }
+ unsigned Opcode = MI.getOpcode();
+ MachineBasicBlock *MBB = MI.getParent();
+ // Legalize operands and check for size mismatch
+ if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
+ OpIdx >= get(Opcode).getNumOperands())
+ return;
+
+ MachineOperand &Op = MI.getOperand(OpIdx);
+ if (!Op.isReg() || !Op.getReg().isVirtual())
+ return;
+
+ const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
+ if (!RI.isVGPRClass(CurrRC))
+ return;
+
+ unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
+ const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
+ if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
+ Op.setSubReg(AMDGPU::lo16);
+ } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
+ const DebugLoc &DL = MI.getDebugLoc();
+ Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+ BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
+ BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
+ .addReg(Op.getReg())
+ .addImm(AMDGPU::lo16)
+ .addReg(Undef)
+ .addImm(AMDGPU::hi16);
+ Op.setReg(NewDstReg);
+ }
+}
+void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
+ MachineRegisterInfo &MRI) const {
+ for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
+ legalizeOperandsVALUt16(MI, OpIdx, MRI);
}
void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
@@ -7769,15 +7794,14 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
- // If this is a v2s copy src from 16bit to 32bit,
- // replace vgpr copy to reg_sequence
+ // If this is a v2s copy between 16bit and 32bit reg,
+ // replace vgpr copy to reg_sequence/extract_subreg
// This can be remove after we have sgpr16 in place
if (ST.useRealTrue16Insts() && Inst.isCopy() &&
Inst.getOperand(1).getReg().isVirtual() &&
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
- if (16 == RI.getRegSizeInBits(*SrcRegRC) &&
- 32 == RI.getRegSizeInBits(*NewDstRC)) {
+ if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
@@ -7789,7 +7813,13 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
.addReg(Undef)
.addImm(AMDGPU::hi16);
Inst.eraseFromParent();
-
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+ return;
+ } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
+ AMDGPU::lo16)) {
+ Inst.getOperand(1).setSubReg(AMDGPU::lo16);
+ Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
return;
@@ -7885,23 +7915,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
assert(NewDstRC);
NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
-
- // Check useMI of NewInstr. If used by a true16 instruction,
- // add a lo16 subreg access if size mismatched
- // This can be remove after we have sgpr16 in place
- if (ST.useRealTrue16Insts() && NewDstRC == &AMDGPU::VGPR_32RegClass) {
- for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
- E = MRI.use_end();
- I != E; ++I) {
- MachineInstr &UseMI = *I->getParent();
- unsigned UseMIOpcode = UseMI.getOpcode();
- if (AMDGPU::isTrue16Inst(UseMIOpcode) &&
- (16 ==
- RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) {
- I->setSubReg(AMDGPU::lo16);
- }
- }
- }
}
fixImplicitOperands(*NewInstr);
@@ -8709,6 +8722,8 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
++I;
} while (I != E && I->getParent() == &UseMI);
} else {
+ legalizeOperandsVALUt16(UseMI, OpNo, MRI);
+
++I;
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 64ab064a75f44..01dd3c9f4119e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1304,6 +1304,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
/// Fix operands in Inst to fix 16bit SALU to VALU lowering.
void legalizeOperandsVALUt16(MachineInstr &Inst,
MachineRegisterInfo &MRI) const;
+ void legalizeOperandsVALUt16(MachineInstr &Inst, unsigned OpIdx,
+ MachineRegisterInfo &MRI) const;
/// Replace the instructions opcode with the equivalent VALU
/// opcode. This function will also move the users of MachineInstruntions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 44abfd272be88..9126b08857153 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -21659,134 +21659,119 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v86, v0
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:192
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v28 :: v_dual_mov_b32 v34, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v24 :: v_dual_mov_b32 v38, v23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v22 :: v_dual_mov_b32 v49, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v20 :: v_dual_mov_b32 v51, v19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v18 :: v_dual_mov_b32 v53, v17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v16 :: v_dual_mov_b32 v54, v15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v14 :: v_dual_mov_b32 v65, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v12 :: v_dual_mov_b32 v66, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, v10 :: v_dual_mov_b32 v69, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v8 :: v_dual_mov_b32 v70, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, v6 :: v_dual_mov_b32 v81, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v4 :: v_dual_mov_b32 v82, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v2 :: v_dual_mov_b32 v85, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
@@ -21799,305 +21784,305 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v66
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v65
; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v55
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v49
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v51
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v79
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v89
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v75
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v78
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v77
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v63
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v72
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v59
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v47
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v183
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v179
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v180
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v181
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v167
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v176
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v177
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v163
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v164
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v151
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v160
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v161
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v135
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v145
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v131
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v133
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v128
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v114
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v102
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v112
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v98
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v100
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v87
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v96
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v97
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -22184,31 +22169,31 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3
; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v86
; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v84
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v71
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v70
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -22224,61 +22209,61 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v68
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v64
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v52
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v51
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v50
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v33
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -22289,63 +22274,63 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v92
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v91
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v90
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v89
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v88
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v78
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v77
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v76
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v75
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v74
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v72
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v63
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v61
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v60
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v59
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -22356,61 +22341,61 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v58
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v57
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v56
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v47
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v46
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v45
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v44
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v43
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v42
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v41
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v40
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v183
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v182
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v181
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v180
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v179
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v178
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v177
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v167
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -22421,61 +22406,61 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v166
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v165
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v163
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v162
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v161
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v160
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v151
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v150
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v148
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v147
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v146
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v144
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v135
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v133
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v131
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -22486,52 +22471,52 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v130
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v128
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v113
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v114
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v116
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v101
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v100
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v102
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v96
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v87
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v33, v29
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v36, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v34
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -60371,134 +60356,119 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v86, v0
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:192
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v28 :: v_dual_mov_b32 v34, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v24 :: v_dual_mov_b32 v38, v23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v22 :: v_dual_mov_b32 v49, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v20 :: v_dual_mov_b32 v51, v19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v18 :: v_dual_mov_b32 v53, v17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v16 :: v_dual_mov_b32 v54, v15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v14 :: v_dual_mov_b32 v65, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v12 :: v_dual_mov_b32 v66, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, v10 :: v_dual_mov_b32 v69, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v8 :: v_dual_mov_b32 v70, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, v6 :: v_dual_mov_b32 v81, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v4 :: v_dual_mov_b32 v82, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v2 :: v_dual_mov_b32 v85, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB39_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
@@ -60511,305 +60481,305 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v66
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v65
; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v55
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v49
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v51
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v79
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v89
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v75
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v78
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v77
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v63
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v72
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v59
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v47
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v183
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v179
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v180
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v181
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v167
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v176
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v177
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v163
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v164
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v151
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v160
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v161
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v135
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v145
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v131
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v133
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v128
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v114
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v102
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v112
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v98
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v100
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v87
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v96
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v97
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -60896,31 +60866,31 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3
; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v86
; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v84
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v71
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v70
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -60936,61 +60906,61 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v68
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v64
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v52
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v51
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v50
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v33
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -61001,63 +60971,63 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v92
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v91
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v90
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v89
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v88
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v78
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v77
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v76
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v75
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v74
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v72
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v63
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v61
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v60
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v59
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -61068,61 +61038,61 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v58
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v57
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v56
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v47
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v46
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v45
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v44
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v43
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v42
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v41
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v40
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v183
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v182
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v181
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v180
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v179
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v178
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v177
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v167
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -61133,61 +61103,61 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v166
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v165
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v163
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v162
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v161
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v160
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v151
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v150
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v148
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v147
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v146
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v144
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v135
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v133
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v131
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -61198,52 +61168,52 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v130
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v128
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v113
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v114
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v116
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v101
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v100
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v102
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v96
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v87
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v33, v29
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v36, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v34
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -97102,134 +97072,119 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v86, v0
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:192
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v28 :: v_dual_mov_b32 v34, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v24 :: v_dual_mov_b32 v38, v23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v22 :: v_dual_mov_b32 v49, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v20 :: v_dual_mov_b32 v51, v19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v18 :: v_dual_mov_b32 v53, v17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v16 :: v_dual_mov_b32 v54, v15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v14 :: v_dual_mov_b32 v65, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v12 :: v_dual_mov_b32 v66, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, v10 :: v_dual_mov_b32 v69, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v8 :: v_dual_mov_b32 v70, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, v6 :: v_dual_mov_b32 v81, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v4 :: v_dual_mov_b32 v82, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v2 :: v_dual_mov_b32 v85, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
@@ -97242,305 +97197,305 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v66
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v65
; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v55
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v49
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v51
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v79
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v89
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v75
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v78
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v77
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v63
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v72
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v59
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v47
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v183
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v179
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v180
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v181
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v167
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v176
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v177
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v163
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v164
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v151
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v160
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v161
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v135
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v145
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v131
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v133
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v128
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v114
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v102
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v112
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v98
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v100
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v87
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v96
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v97
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -97627,31 +97582,31 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3
; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v86
; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v84
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v71
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v70
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -97667,61 +97622,61 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v68
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v64
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v52
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v51
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v50
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v33
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -97732,63 +97687,63 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v92
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v91
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v90
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v89
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v88
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v78
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v77
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v76
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v75
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v74
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v72
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v63
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v61
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v60
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v59
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -97799,61 +97754,61 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v58
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v57
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v56
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v47
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v46
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v45
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v44
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v43
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v42
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v41
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v40
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v183
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v182
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v181
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v180
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v179
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v178
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v177
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v167
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -97864,61 +97819,61 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v166
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v165
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v163
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v162
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v161
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v160
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v151
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v150
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v148
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v147
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v146
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v144
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v135
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v133
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v131
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -97929,52 +97884,52 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v130
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v128
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v113
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v114
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v116
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v101
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v100
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v102
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v96
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v87
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v33, v29
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v36, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v34
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -133776,134 +133731,119 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v86, v0
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:192
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v28 :: v_dual_mov_b32 v34, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v24 :: v_dual_mov_b32 v38, v23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v22 :: v_dual_mov_b32 v49, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v20 :: v_dual_mov_b32 v51, v19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v18 :: v_dual_mov_b32 v53, v17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v16 :: v_dual_mov_b32 v54, v15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v14 :: v_dual_mov_b32 v65, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v12 :: v_dual_mov_b32 v66, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, v10 :: v_dual_mov_b32 v69, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v8 :: v_dual_mov_b32 v70, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, v6 :: v_dual_mov_b32 v81, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v4 :: v_dual_mov_b32 v82, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v2 :: v_dual_mov_b32 v85, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB75_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
@@ -133916,305 +133856,305 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v66
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v65
; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v55
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v49
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v51
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v79
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v89
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v75
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v78
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v77
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v63
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v72
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v59
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v47
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v183
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v179
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v180
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v181
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v167
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v176
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v177
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v163
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v164
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v151
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v160
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v161
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v135
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v145
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v131
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v133
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v128
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v114
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v102
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v112
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v98
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v100
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v87
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v96
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v97
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -134301,31 +134241,31 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3
; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v86
; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v84
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v71
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v70
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -134341,61 +134281,61 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v68
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v64
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v52
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v51
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v50
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v33
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -134406,63 +134346,63 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v92
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v91
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v90
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v89
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v88
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v79
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v78
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v77
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v76
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v75
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v74
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v73
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v72
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v63
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v62
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v61
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v60
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v59
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -134473,61 +134413,61 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v58
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v57
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v56
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v47
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v46
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v45
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v44
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v43
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v42
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v41
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v40
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v183
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v182
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v181
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v180
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v179
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v178
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v177
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v167
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -134538,61 +134478,61 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v166
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v165
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v163
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v162
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v161
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v160
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v151
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v150
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v148
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v147
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v146
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v144
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v135
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v134
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v133
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v131
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -134603,52 +134543,52 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v130
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v128
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v113
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v114
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v116
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v101
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v100
; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v102
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v96
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v87
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v33, v29
; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v36, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v34
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
@@ -160847,8 +160787,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v6 :: v_dual_mov_b32 v33, v0
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304
@@ -160898,72 +160839,56 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, v30 :: v_dual_mov_b32 v81, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v28 :: v_dual_mov_b32 v83, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v85, v26 :: v_dual_mov_b32 v80, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v25 :: v_dual_mov_b32 v82, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v23 :: v_dual_mov_b32 v64, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v20 :: v_dual_mov_b32 v68, v19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, v18 :: v_dual_mov_b32 v55, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v17 :: v_dual_mov_b32 v65, v15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, v14 :: v_dual_mov_b32 v49, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v54, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v11 :: v_dual_mov_b32 v36, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v37, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v7 :: v_dual_mov_b32 v35, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v39, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v32, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB89_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -161021,20 +160946,20 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v86
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v85
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v83
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v98
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v2, 16, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v84
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v87
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -161042,48 +160967,48 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v97
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v112
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v115
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v119
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v128
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v129
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19
@@ -161428,84 +161353,84 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v144
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v135
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v133
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v131
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v132, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v129
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v130
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v128
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v119
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v117
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v113
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v114
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v112
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v103
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v100
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v98
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v86
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v87
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v96
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v55, 3, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v97
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v98
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v85
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v87
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v85
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v84
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v5
@@ -161611,16 +161536,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v128
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v129, 16, v33
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v117, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
@@ -161639,7 +161564,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v131, 16, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33
@@ -188892,8 +188817,9 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v6 :: v_dual_mov_b32 v33, v0
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304
@@ -188943,72 +188869,56 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, v30 :: v_dual_mov_b32 v81, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v28 :: v_dual_mov_b32 v83, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v85, v26 :: v_dual_mov_b32 v80, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v25 :: v_dual_mov_b32 v82, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v23 :: v_dual_mov_b32 v64, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v20 :: v_dual_mov_b32 v68, v19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, v18 :: v_dual_mov_b32 v55, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v17 :: v_dual_mov_b32 v65, v15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, v14 :: v_dual_mov_b32 v49, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v54, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v11 :: v_dual_mov_b32 v36, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v37, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v7 :: v_dual_mov_b32 v35, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v39, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v32, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB93_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -189066,20 +188976,20 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v86
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v85
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v83
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v98
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v2, 16, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v84
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v87
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -189087,48 +188997,48 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v97
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v112
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v115
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v119
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v128
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v129
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19
@@ -189473,84 +189383,84 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v144
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v135
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v133
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v131
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v132, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v129
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v130
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v128
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v119
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v117
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v113
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v114
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v112
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v103
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v100
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v98
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v86
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v87
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v96
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v55, 3, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v97
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v98
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v85
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v87
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v85
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v84
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v5
@@ -189656,16 +189566,16 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v128
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v129, 16, v33
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v117, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
@@ -189684,7 +189594,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v131, 16, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33
@@ -212549,8 +212459,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v6 :: v_dual_mov_b32 v33, v0
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304
@@ -212600,72 +212511,56 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:64
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, v30 :: v_dual_mov_b32 v81, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v28 :: v_dual_mov_b32 v83, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v85, v26 :: v_dual_mov_b32 v80, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v25 :: v_dual_mov_b32 v82, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v23 :: v_dual_mov_b32 v64, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v20 :: v_dual_mov_b32 v68, v19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, v18 :: v_dual_mov_b32 v55, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v17 :: v_dual_mov_b32 v65, v15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, v14 :: v_dual_mov_b32 v49, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v54, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v11 :: v_dual_mov_b32 v36, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v37, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v7 :: v_dual_mov_b32 v35, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v39, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v32, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB97_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -212723,20 +212618,20 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v86
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v85
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v83
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v98
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v2, 16, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v84
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v87
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -212744,48 +212639,48 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v97
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v112
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v115
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v130
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v119
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v128
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v129
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19
@@ -213130,84 +213025,84 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v144
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v145
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v135
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v133
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v131
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v132, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v129
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v130
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v128
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v119
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v117
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v118
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v116
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v115
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v113
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v114
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v112
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v103
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v102
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v100
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v98
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v86
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v87
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v96
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v55, 3, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v97
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v97
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v98
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v85
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v86
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v87
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v85
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v84
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v5
@@ -213313,16 +213208,16 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v132
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v128
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v129, 16, v33
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v117, 16, v32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
@@ -213341,7 +213236,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v131, 16, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index 178718a338432..8dc00701dcfd6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -7393,15 +7393,10 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v15, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v3 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v21, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB27_4
@@ -7441,27 +7436,27 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v19
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v15
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v14, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
@@ -7470,14 +7465,14 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6
; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v23
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -7535,22 +7530,22 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v8
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v16
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
@@ -14771,15 +14766,10 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v15, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v3 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v21, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
@@ -14819,27 +14809,27 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v19
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v15
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v14, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
@@ -14848,14 +14838,14 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6
; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v23
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -14913,22 +14903,22 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v8
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v16
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
@@ -21656,15 +21646,10 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v15, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v3 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v21, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB71_4
@@ -21704,27 +21689,27 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v19
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v15
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v14, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
@@ -21733,14 +21718,14 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6
; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v23
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -21798,22 +21783,22 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v8
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v16
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
@@ -28039,15 +28024,10 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v15, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v3 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v21, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB87_4
@@ -28087,27 +28067,27 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v19
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v15
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v14, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
@@ -28116,14 +28096,14 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6
; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v23
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -28181,22 +28161,22 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v8
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v16
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
@@ -34105,20 +34085,10 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v21, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v19, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v15, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v2 :: v_dual_mov_b32 v17, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_4
@@ -34149,25 +34119,25 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v16
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v13
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6
@@ -34231,42 +34201,42 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v22
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v15
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
@@ -34280,7 +34250,7 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v6
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v7
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v8, 16, v7
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
@@ -39305,20 +39275,10 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v21, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v19, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v15, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v2 :: v_dual_mov_b32 v17, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB107_4
@@ -39349,25 +39309,25 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v16
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v13
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6
@@ -39431,42 +39391,42 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v22
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v15
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
@@ -39480,7 +39440,7 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v6
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v7
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v8, 16, v7
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
@@ -43653,20 +43613,10 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v21, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v19, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v15, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v2 :: v_dual_mov_b32 v17, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB111_4
@@ -43697,25 +43647,25 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v16
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v13
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6
@@ -43779,42 +43729,42 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v22
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v15
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
@@ -43828,7 +43778,7 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v6
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v7
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v8, 16, v7
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index d966d136d75b6..73c730f3c30dd 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -6469,17 +6469,11 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v23, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v25, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v5 :: v_dual_mov_b32 v27, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_mov_b32 v31, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
@@ -6509,7 +6503,7 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v30
; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
@@ -6527,29 +6521,29 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v27
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v28
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v29
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v22
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v14
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v18
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v19
@@ -6557,9 +6551,9 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v32, v33
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v22, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v34, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v36, v37
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v36, v37
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -6568,12 +6562,12 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v34, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -6631,36 +6625,36 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v30
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v28
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v26
; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v27
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v24
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v25
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18
@@ -13954,17 +13948,11 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v23, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v25, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v5 :: v_dual_mov_b32 v27, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_mov_b32 v31, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4
@@ -13994,7 +13982,7 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v30
; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
@@ -14012,29 +14000,29 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v27
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v28
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v29
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v22
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v14
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v18
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v19
@@ -14042,9 +14030,9 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v32, v33
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v22, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v34, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v36, v37
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v36, v37
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -14053,12 +14041,12 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v34, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
@@ -14116,36 +14104,36 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v30
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v28
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v26
; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v27
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v24
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v25
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18
@@ -21021,28 +21009,14 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v15 :: v_dual_mov_b32 v25, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v27, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v36, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v10 :: v_dual_mov_b32 v24, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v7 :: v_dual_mov_b32 v26, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v5 :: v_dual_mov_b32 v35, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v3 :: v_dual_mov_b32 v38, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v37, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
@@ -21073,44 +21047,44 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v37
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v38
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v23
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v32
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v7
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v19
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v5
@@ -21170,61 +21144,61 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v18
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v32
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v31
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v21
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v29
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v32
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v25
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v33
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v37
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v28
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
@@ -27569,28 +27543,14 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v15 :: v_dual_mov_b32 v25, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v27, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v36, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v10 :: v_dual_mov_b32 v24, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v7 :: v_dual_mov_b32 v26, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v5 :: v_dual_mov_b32 v35, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v3 :: v_dual_mov_b32 v38, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v37, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB63_4
@@ -27621,44 +27581,44 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v37
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v38
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v23
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v32
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v7
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v19
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v5
@@ -27718,61 +27678,61 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v18
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v32
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v31
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v21
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v29
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v32
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v25
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v33
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v37
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v28
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
@@ -31989,23 +31949,14 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v15 :: v_dual_mov_b32 v23, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v14 :: v_dual_mov_b32 v25, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v11 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v29, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v8 :: v_dual_mov_b32 v31, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v4 :: v_dual_mov_b32 v35, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB73_4
@@ -32035,7 +31986,7 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v36
; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
@@ -32053,29 +32004,29 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v35
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v34
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v32
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v29
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v18
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v19
@@ -32157,49 +32108,49 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v36
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v34
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v33
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v32
; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v32
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v24
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v17
@@ -36633,23 +36584,14 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v15 :: v_dual_mov_b32 v23, v13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v14 :: v_dual_mov_b32 v25, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v11 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v29, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v8 :: v_dual_mov_b32 v31, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v4 :: v_dual_mov_b32 v35, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB77_4
@@ -36679,7 +36621,7 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v36
; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
@@ -36697,29 +36639,29 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v35
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v34
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v32
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v29
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v18
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v19
@@ -36801,49 +36743,49 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v36
; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v34
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v33
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v32
; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v32
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v24
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v17
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 397955a8a8928..ca27410a1c127 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -15124,42 +15124,34 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v16i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_mov_b32 v54, v0
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v15 :: v_dual_mov_b32 v32, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v34, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v12 :: v_dual_mov_b32 v36, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v9 :: v_dual_mov_b32 v38, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v8 :: v_dual_mov_b32 v48, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v5 :: v_dual_mov_b32 v50, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v4 :: v_dual_mov_b32 v52, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB27_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -15196,37 +15188,37 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v37
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v34
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7
@@ -15267,28 +15259,28 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v82
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87
@@ -15355,7 +15347,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v54
; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
@@ -15367,14 +15359,14 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v49
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
@@ -15387,31 +15379,31 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
@@ -15461,42 +15453,42 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v83
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v69
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v67
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v68
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15
@@ -30479,42 +30471,34 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v16f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_mov_b32 v54, v0
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v15 :: v_dual_mov_b32 v32, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v34, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v12 :: v_dual_mov_b32 v36, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v9 :: v_dual_mov_b32 v38, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v8 :: v_dual_mov_b32 v48, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v5 :: v_dual_mov_b32 v50, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v4 :: v_dual_mov_b32 v52, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -30551,37 +30535,37 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v37
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v34
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7
@@ -30622,28 +30606,28 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v82
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87
@@ -30710,7 +30694,7 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v54
; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
@@ -30722,14 +30706,14 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v49
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
@@ -30742,31 +30726,31 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
@@ -30816,42 +30800,42 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v83
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v69
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v67
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v68
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15
@@ -45105,42 +45089,34 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v8i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_mov_b32 v54, v0
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v15 :: v_dual_mov_b32 v32, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v34, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v12 :: v_dual_mov_b32 v36, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v9 :: v_dual_mov_b32 v38, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v8 :: v_dual_mov_b32 v48, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v5 :: v_dual_mov_b32 v50, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v4 :: v_dual_mov_b32 v52, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB71_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -45177,37 +45153,37 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v37
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v34
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7
@@ -45248,28 +45224,28 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v82
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87
@@ -45336,7 +45312,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v54
; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
@@ -45348,14 +45324,14 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v49
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
@@ -45368,31 +45344,31 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
@@ -45442,42 +45418,42 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v83
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v69
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v67
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v68
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15
@@ -58885,42 +58861,34 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v8f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_mov_b32 v54, v0
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v15 :: v_dual_mov_b32 v32, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v34, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v12 :: v_dual_mov_b32 v36, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v9 :: v_dual_mov_b32 v38, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v8 :: v_dual_mov_b32 v48, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v5 :: v_dual_mov_b32 v50, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v4 :: v_dual_mov_b32 v52, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB87_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -58957,37 +58925,37 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff
; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54
; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v37
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v34
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7
@@ -59028,28 +58996,28 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v82
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87
@@ -59116,7 +59084,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v54
; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
@@ -59128,14 +59096,14 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v49
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
@@ -59148,31 +59116,31 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v36
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
@@ -59222,42 +59190,42 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v83
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v69
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v67
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v68
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v55
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15
@@ -72878,57 +72846,34 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_mov_b32 v52, v0
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v15 :: v_dual_mov_b32 v34, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v13 :: v_dual_mov_b32 v54, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v50, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v11 :: v_dual_mov_b32 v48, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v9 :: v_dual_mov_b32 v36, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v6 :: v_dual_mov_b32 v38, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v39, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -72952,8 +72897,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9
; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8
@@ -72961,14 +72906,14 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v50
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
@@ -72977,10 +72922,10 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v48
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v35
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v49
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v2, 16, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8
@@ -72988,63 +72933,63 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v17
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v0, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v23
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v66
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v64
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v83
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11
@@ -73103,151 +73048,151 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v84
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v70
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v68
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v22
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v26
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v29
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v37
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v25
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v34
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v28, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v16, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v20, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v32
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v22, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v18, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v23, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v25, 16, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v8
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v16, 16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v17, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v18
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v37, 16, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v6
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v21
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v2, 16, v16
@@ -85910,57 +85855,34 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_mov_b32 v52, v0
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v15 :: v_dual_mov_b32 v34, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v13 :: v_dual_mov_b32 v54, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v50, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v11 :: v_dual_mov_b32 v48, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v9 :: v_dual_mov_b32 v36, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v6 :: v_dual_mov_b32 v38, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v39, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB107_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -85984,8 +85906,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9
; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8
@@ -85993,14 +85915,14 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v50
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
@@ -86009,10 +85931,10 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v48
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v35
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v49
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v2, 16, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8
@@ -86020,63 +85942,63 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v17
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v0, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v23
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v66
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v64
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v83
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11
@@ -86135,151 +86057,151 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v84
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v70
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v68
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v22
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v26
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v29
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v37
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v25
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v34
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v28, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v16, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v20, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v32
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v22, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v18, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v23, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v25, 16, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v8
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v16, 16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v17, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v18
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v37, 16, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v6
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v21
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v2, 16, v16
@@ -97280,57 +97202,34 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32bf16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_mov_b32 v52, v0
; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v15 :: v_dual_mov_b32 v34, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v13 :: v_dual_mov_b32 v54, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v50, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v11 :: v_dual_mov_b32 v48, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v9 :: v_dual_mov_b32 v36, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v6 :: v_dual_mov_b32 v38, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v39, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB111_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -97354,8 +97253,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9
; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8
@@ -97363,14 +97262,14 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v50
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
@@ -97379,10 +97278,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v48
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v35
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v49
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v2, 16, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8
@@ -97390,63 +97289,63 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v17
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v0, 16, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v19
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v23
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v27
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v30
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v66
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v64
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v71
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v80
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v83
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v81
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11
@@ -97505,151 +97404,151 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v84
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85
; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v82
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v83
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v70
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v67
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v68
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v22
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v26
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v29
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v37
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v25
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v34
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v54
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v51
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v28, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v16, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v50
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v20, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v32
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v22, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v38
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v18, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5
; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v23, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v25, 16, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v8
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v16, 16, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v17, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v18
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v37, 16, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v6
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v21
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v2, 16, v16
diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
index b27ad26cf97b9..4cb5b7c43a46d 100644
--- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -77,18 +77,19 @@ define amdgpu_kernel void @br_cc_f16(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0.h, v1.h
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB0_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %one
-; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-TRUE16-NEXT: s_endpgm
+; GFX11-TRUE16-NEXT: s_branch .LBB0_3
; GFX11-TRUE16-NEXT: .LBB0_2: ; %two
-; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[0:3], 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT: .LBB0_3: ; %one
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: br_cc_f16:
@@ -192,13 +193,15 @@ define amdgpu_kernel void @br_cc_f16_imm_a(
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s3
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v0.h
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB1_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %one
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x3800
+; GFX11-TRUE16-NEXT: s_branch .LBB1_3
; GFX11-TRUE16-NEXT: .LBB1_2: ; %two
+; GFX11-TRUE16-NEXT: .LBB1_3: ; %one
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
@@ -298,13 +301,15 @@ define amdgpu_kernel void @br_cc_f16_imm_b(
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s3
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v1.l
-; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_2
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %two
+; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v0.h
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB2_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %one
+; GFX11-TRUE16-NEXT: s_branch .LBB2_3
+; GFX11-TRUE16-NEXT: .LBB2_2: ; %two
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x3800
-; GFX11-TRUE16-NEXT: .LBB2_2: ; %one
+; GFX11-TRUE16-NEXT: .LBB2_3: ; %one
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index f9db082a2e912..9b6a2f3a1aa1e 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -57,6 +57,57 @@ body: |
%4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec
...
+---
+name: salu16_usedby_salu32
+body: |
+ bb.0:
+ ; GCN-LABEL: name: salu16_usedby_salu32
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[DEF]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_TRUNC_F16_t16_e64_]], %subreg.lo16, [[DEF2]], %subreg.hi16
+ ; GCN-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[REG_SEQUENCE]], [[DEF]], implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = COPY %0:vgpr_32
+ %2:sreg_32 = S_TRUNC_F16 %1:sreg_32, implicit $mode
+ %3:sreg_32 = S_XOR_B32 %2:sreg_32, %1:sreg_32, implicit-def $scc
+...
+
+---
+name: salu32_usedby_salu16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: salu32_usedby_salu16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[DEF]], [[DEF]], implicit $exec
+ ; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[V_XOR_B32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = COPY %0:vgpr_32
+ %2:sreg_32 = S_XOR_B32 %1:sreg_32, %1:sreg_32, implicit-def $scc
+ %3:sreg_32 = S_TRUNC_F16 %2:sreg_32, implicit $mode
+...
+
+---
+name: S_FMAC_F16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: S_FMAC_F16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sgpr_lo16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF2]], %subreg.hi16
+ ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF3]], %subreg.hi16
+ ; GCN-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_FMAC_F16_t16_e64 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_16 = IMPLICIT_DEF
+ %1:sgpr_lo16 = COPY %0:vgpr_16
+ %2:sreg_32 = COPY %0:vgpr_16
+ %3:sreg_32 = COPY %1:sgpr_lo16
+ %4:sreg_32 = S_FMAC_F16 %3:sreg_32, %3:sreg_32, %2:sreg_32, implicit $mode
+...
+
---
name: vgpr16_to_spgr32
body: |
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 125d009429cbf..7a1351174733b 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -6,7 +6,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1150 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-FAKE16 %s
define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; SI-LABEL: frem_f16:
@@ -255,42 +256,81 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: frem_f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v0, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1150-NEXT: v_mul_f32_e32 v3, v3, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_fmac_f32_e32 v3, v5, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_mul_f32_e32 v4, v5, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: frem_f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -456,26 +496,47 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: fast_frem_f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v0, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_rcp_f16_e32 v3, v2
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3
-; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: fast_frem_f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: fast_frem_f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_rcp_f16_e32 v3, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -641,26 +702,47 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: unsafe_frem_f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v0, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_rcp_f16_e32 v3, v2
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3
-; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: unsafe_frem_f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: unsafe_frem_f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_rcp_f16_e32 v3, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #1 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -2308,68 +2390,130 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: frem_v2f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v0, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX1150-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
-; GFX1150-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_fmac_f32_e32 v4, v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6
-; GFX1150-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v5, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v4, v4
-; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_fmac_f16_e32 v3, v4, v5
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX1150-NEXT: v_rcp_f32_e32 v5, v5
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v5
-; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f32_e32 v4, v6, v5
-; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v5, v6, v5
-; GFX1150-NEXT: v_and_b32_e32 v5, 0xff800000, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_add_f32_e32 v4, v5, v4
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v2, v1
-; GFX1150-NEXT: v_trunc_f16_e32 v4, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v4, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v3
-; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: frem_v2f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_b32 v3, v1, s[4:5] offset:16
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v5.l, v4.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v4.l, v0.l, v5.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v5, v3.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v5, v5
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v5
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v5
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v5, v0
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v0, v2.l, v4.l
+; GFX1150-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_v2f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v6, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v5, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v4, v4
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v3, v4, v5
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v5
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v6, v5
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v5, v4
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v2, v1
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v4, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v4, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX1150-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
%r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
@@ -3034,115 +3178,226 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: frem_v4f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v4, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[2:3]
-; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32
-; GFX1150-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v8, v7
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v8, v8
-; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v8
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_fmac_f32_e32 v6, v9, v8
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_mul_f32_e32 v8, v9, v8
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_and_b32_e32 v8, 0xff800000, v8
-; GFX1150-NEXT: v_add_f32_e32 v6, v8, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v7, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v6, v6
-; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_fmac_f16_e32 v5, v6, v7
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX1150-NEXT: v_rcp_f32_e32 v7, v7
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v7
-; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f32_e32 v6, v8, v7
-; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7
-; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_add_f32_e32 v6, v7, v6
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v2, v0
-; GFX1150-NEXT: v_trunc_f16_e32 v6, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6
-; GFX1150-NEXT: v_fma_f16 v0, v6, v2, v0
-; GFX1150-NEXT: v_lshrrev_b32_e32 v6, 16, v3
-; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1150-NEXT: v_pack_b32_f16 v0, v0, v5
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT: v_rcp_f32_e32 v7, v7
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v7
-; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f32_e32 v5, v8, v7
-; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7
-; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v6, v2
-; GFX1150-NEXT: v_trunc_f16_e32 v5, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT: v_fmac_f16_e32 v2, v5, v6
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v1
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_fmac_f32_e32 v5, v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6
-; GFX1150-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v3, v1
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v5, v5
-; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v5, v3
-; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: frem_v4f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_b64 v[1:2], v5, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_b64 v[3:4], v5, s[4:5] offset:32
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.h
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.h
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v6.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v0.l, v7.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v3.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v7
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v10, v7
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v8, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v7, v0
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v4.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v6.l
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v3, v3
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v3
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v3, v0
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v6.l, v3.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v6.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v3.l
+; GFX1150-TRUE16-NEXT: global_store_b64 v5, v[1:2], s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_v4f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_b64 v[0:1], v4, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v8, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v8, v8
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v8
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v6, v9, v8
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v8, 0xff800000, v8
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v6, v8, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v7, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v6, v6
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v5, v6, v7
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v0
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v7
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v2, v0
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v6, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6
+; GFX1150-FAKE16-NEXT: v_fma_f16 v0, v6, v2, v0
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v5
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v7
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v5, v8, v7
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v5, v7, v5
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v6, v2
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v5, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v2, v5, v6
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v5, v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v3, v1
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v5, v5
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v5, v3
+; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX1150-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
%r0 = load <4 x half>, ptr addrspace(1) %in1, align 16
More information about the llvm-commits
mailing list